rfc9520xml2.original.xml   rfc9520.xml 
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="rfc7991bis.rnc"?>
<!-- <?xml-stylesheet type="text/xsl" href="rfc2629.xslt" ?> -->
<!DOCTYPE rfc [ <!DOCTYPE rfc [
<!ENTITY nbsp "&#160;"> <!ENTITY nbsp "&#160;">
<!ENTITY zwsp "&#8203;"> <!ENTITY zwsp "&#8203;">
<!ENTITY nbhy "&#8209;"> <!ENTITY nbhy "&#8209;">
<!ENTITY ouml "&#246;">
<!ENTITY uuml "&#252;">
<!ENTITY wj "&#8288;"> <!ENTITY wj "&#8288;">
<!-- One method to get references from the online citation libraries.
There has to be one entity for each item to be referenced.
An alternate method (rfc include) is described in the references. -->
<!ENTITY RFC0882 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.0882.xml">
<!ENTITY RFC0883 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.0883.xml">
<!ENTITY RFC1034 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.1034.xml">
<!ENTITY RFC1035 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.1035.xml">
<!ENTITY RFC2119 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2119.xml">
<!ENTITY RFC2308 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.2308.xml">
<!ENTITY RFC4035 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4035.xml">
<!ENTITY RFC4686 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4686.xml">
<!ENTITY RFC4697 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4697.xml">
<!ENTITY RFC4732 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.4732.xml">
<!ENTITY RFC5452 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.5452.xml">
<!ENTITY RFC6891 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.6891.xml">
<!ENTITY RFC7766 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7766.xml">
<!ENTITY RFC7873 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7873.xml">
<!ENTITY RFC7858 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.7858.xml">
<!ENTITY RFC8174 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8174.xml">
<!ENTITY RFC8484 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8484.xml">
<!ENTITY RFC8767 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8767.xml">
<!ENTITY RFC8914 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.8914.xml">
<!ENTITY RFC9250 PUBLIC "" "https://xml2rfc.ietf.org/public/rfc/bibxml/reference.RFC.9250.xml">
]> ]>
<?xml-stylesheet type="text/xsl" href="rfc2629.xslt"?>
<?rfc strict="yes" ?>
<?rfc toc="yes"?>
<?rfc tocdepth="4"?>
<?rfc symrefs="yes"?>
<?rfc sortrefs="yes" ?>
<?rfc compact="yes" ?>
<?rfc subcompact="no" ?>
<rfc xmlns:xi="http://www.w3.org/2001/XInclude" category="std" docName="draft-ietf-dnsop-caching-resolution-failures-08" ipr="trust200902" consensus="true" updates="2308, 4035, 4697" submissionType="IETF">
<!-- category values: std, bcp, info, exp, and historic
ipr values: full3667, noModification3667, noDerivatives3667
you can add the attributes updates="NNNN" and obsoletes="NNNN"
they will automatically be output with "(if approved)" -->
<!-- ***** FRONT MATTER ***** --> <rfc xmlns:xi="http://www.w3.org/2001/XInclude" submissionType="IETF" category="std" consensus="true" docName="draft-ietf-dnsop-caching-resolution-failures-08" number="9520" ipr="trust200902" updates="2308, 4035, 4697" obsoletes="" tocInclude="true" tocDepth="4" symRefs="true" sortRefs="true" xml:lang="en" version="3">
<front> <front>
<!-- The abbreviated title is used in the page header - it is only necessary
if the
full title is longer than 39 characters -->
<title abbrev="Caching Resolution Failures">Negative Caching of DNS Resolution Failures</title> <title abbrev="Caching Resolution Failures">Negative Caching of DNS Resolution Failures</title>
<seriesInfo name="RFC" value="9520"/>
<!-- add 'role="editor"' below for the editors if appropriate -->
<!-- Another author who claims to be an editor -->
<author fullname="Duane Wessels" initials="D." surname="Wessels"> <author fullname="Duane Wessels" initials="D." surname="Wessels">
<organization>Verisign</organization> <organization>Verisign</organization>
<address> <address>
<postal> <postal>
<street>12061 Bluemont Way</street> <street>12061 Bluemont Way</street>
<city>Reston</city> <city>Reston</city>
<region>VA</region> <region>VA</region>
<code>20190</code> <code>20190</code>
<country>US</country> <country>United States of America</country>
</postal> </postal>
<phone>+1 703 948-3200</phone> <phone>+1 703 948-3200</phone>
<email>dwessels@verisign.com</email> <email>dwessels@verisign.com</email>
<uri>https://verisign.com</uri> <uri>https://verisign.com</uri>
</address> </address>
</author> </author>
<author fullname="William Carroll" initials="W." surname="Carroll"> <author fullname="William Carroll" initials="W." surname="Carroll">
<organization>Verisign</organization> <organization>Verisign</organization>
<address> <address>
<postal> <postal>
<street>12061 Bluemont Way</street> <street>12061 Bluemont Way</street>
<city>Reston</city> <city>Reston</city>
<region>VA</region> <region>VA</region>
<code>20190</code> <code>20190</code>
<country>US</country> <country>United States of America</country>
</postal> </postal>
<phone>+1 703 948-3200</phone> <phone>+1 703 948-3200</phone>
<email>wicarroll@verisign.com</email> <email>wicarroll@verisign.com</email>
<uri>https://verisign.com</uri> <uri>https://verisign.com</uri>
</address> </address>
</author> </author>
<author fullname="Matthew Thomas" initials="M." surname="Thomas"> <author fullname="Matthew Thomas" initials="M." surname="Thomas">
<organization>Verisign</organization> <organization>Verisign</organization>
<address> <address>
<postal> <postal>
<street>12061 Bluemont Way</street> <street>12061 Bluemont Way</street>
<city>Reston</city> <city>Reston</city>
<region>VA</region> <region>VA</region>
<code>20190</code> <code>20190</code>
<country>US</country> <country>United States of America</country>
</postal> </postal>
<phone>+1 703 948-3200</phone> <phone>+1 703 948-3200</phone>
<email>mthomas@verisign.com</email> <email>mthomas@verisign.com</email>
<uri>https://verisign.com</uri> <uri>https://verisign.com</uri>
</address> </address>
</author> </author>
<date year="2023" month="December"/>
<date year="2023"/> <area>ops</area>
<workgroup>dnsop</workgroup>
<area>General</area>
<workgroup>Internet Engineering Task Force</workgroup>
<keyword>DNS</keyword> <keyword>DNS</keyword>
<keyword>Negative</keyword> <keyword>Negative</keyword>
<keyword>Caching</keyword> <keyword>Caching</keyword>
<abstract> <abstract>
<t> <t>In the DNS, resolvers employ caching to reduce both latency for end
In the DNS, resolvers employ caching to reduce both latency for users and load on authoritative name servers. The process of resolution
end users and load on authoritative name servers. may result in one of three types of responses: (1) a response containing
The process of the requested data, (2) a response indicating the requested data does
resolution may result in one of three types of responses: (1) a not exist, or (3) a non-response due to a resolution failure in which
response containing the requested data; (2) a response indicating the resolver does not receive any useful information regarding the
the requested data does not exist; or (3) a non-response due to data's existence. This document concerns itself only with the third
a resolution failure in which the resolver does not receive any type.</t>
useful information regarding the data's existence. This document <t>RFC 2308 specifies requirements for DNS negative caching. There,
concerns itself only with the third type. caching of TYPE 2 responses is mandatory and caching of TYPE 3
</t> responses is optional. This document updates RFC 2308 to require
<t> negative caching for DNS resolution failures.</t>
RFC 2308 specifies requirements for DNS <t>RFC 4035 allows DNSSEC validation failure caching. This document
negative caching. There, caching of type (2) responses updates RFC 4035 to require caching for DNSSEC validation failures.</t>
is mandatory <t>RFC 4697 prohibits aggressive requerying for NS records at a failed
and caching of type (3) responses zone's parent zone. This document updates RFC 4697 to expand this
is optional. This document updates RFC 2308 requirement to all query types and to all ancestor zones.
to require negative caching
for DNS resolution failures.
</t>
<t>
RFC 4035 allows DNSSEC validation failure caching. This document updates
RFC 4035
to require caching for DNSSEC validation failures.
</t>
<t>
RFC 4697 prohibits aggressive requerying for NS records at a failed zone
's parent
zone. This document updates RFC 4697 to expand this requirement to all q
uery types and to all
ancestor zones.
</t> </t>
</abstract> </abstract>
</front> </front>
<middle> <middle>
<section>
<section title="Introduction"> <name>Introduction</name>
<t>Caching has always been a fundamental component of DNS resolution on
<t> the Internet. For example, <xref target="RFC0882"/> states:</t>
Caching has always been a fundamental component of DNS resolution <blockquote>
on the Internet. For example <xref target="RFC0882"/> states: The sheer size of the database and frequency of updates suggest
</t>
<t>
"The sheer size of the database and frequency of updates suggest
that it must be maintained in a distributed manner, with local that it must be maintained in a distributed manner, with local
caching to improve performance." caching to improve performance.
</t> </blockquote>
<t> <t>The early DNS RFCs (<xref target="RFC0882"/>, <xref
The early DNS RFCs (<xref target="RFC0882"/>, <xref target="RFC0883"/>, <xref target="RFC1034"/>, and <xref
target="RFC0883"/>, <xref target="RFC1034"/>, and <xref target="RFC1035"/>) primarily discuss caching in the context of what
target="RFC1035"/>) primarily discuss caching in the context <xref target="RFC2308"/> calls "positive responses", that is, when the
of what <xref target="RFC2308"/> calls "positive" responses, response includes the requested data. In this case, a TTL is associated
that is, when the response includes the requested data. with each Resource Record (RR) in the response. Resolvers can cache and
In this case, a TTL is associated with each resource record in reuse the data until the TTL expires.
the response. Resolvers can cache and reuse the data until the
TTL expires.
</t> </t>
<t> <t>
Section 4.3.4 of <xref target="RFC1034"/> describes negative <xref target="RFC1034" sectionFormat="of" section="4.3.4"/> describes negative
response caching, but notes it is optional and only talks response caching, but notes it is optional and only talks
about name errors (NXDOMAIN). This is the origin of using about name errors (NXDOMAIN). This is the origin of using
the SOA MINIMUM field as a negative caching TTL. the SOA MINIMUM field as a negative caching TTL.
</t> </t>
<t> <t>
<xref target="RFC2308"/> updated <xref target="RFC1034"/> <xref target="RFC2308"/> updated <xref target="RFC1034"/> to specify
to specify new requirements for DNS negative caching, including new requirements for DNS negative caching, including making it
making it mandatory for caching resolvers to cache mandatory for caching resolvers to cache name error (NXDOMAIN) and no
name error (NXDOMAIN) and no data (NODATA) responses data (NODATA) responses when an SOA record is available to provide a
when a SOA record is available to provide a TTL. TTL. <xref target="RFC2308"/> further specified optional negative
<xref target="RFC2308"/> further specified optional negative caching for caching for two DNS resolution failure cases: server failure and
two DNS dead/unreachable servers.
resolution failure cases: server failure and dead / unreachable servers.
</t> </t>
<t> <t>
This document updates <xref target="RFC2308"/> to require This document updates <xref target="RFC2308"/> to require negative
negative caching of all DNS resolution failures caching of all DNS resolution failures and provides additional
and provides additional examples of resolution failures. examples of resolution failures, <xref
This document also updates <xref target="RFC4035"/> to require target="RFC4035"/> to require caching for DNSSEC validation failures,
caching for DNSSEC validation failures as well as <xref target="RFC4697" as well as <xref target="RFC4697"/> to expand the scope of prohibiting
/> aggressive requerying for NS records at a failed zone's parent zone to
to expand the scope of prohibiting aggressive requerying for NS all query types and to all ancestor zones.
records at a failed zone's parent zone to all query types and
to all ancestor zones.
</t> </t>
<section>
<section title="Motivation"> <name>Motivation</name>
<t> <t>
Operators of DNS services have known for some time that Operators of DNS services have known for some time that
recursive resolvers become more aggressive when they recursive resolvers become more aggressive when they
experience resolution failures. A number of different experience resolution failures. A number of different
anecdotes, experiments, and incidents support this anecdotes, experiments, and incidents support this
claim. claim.
</t> </t>
<t> <t>
In December 2009, a secondary server for a number of In December 2009, a secondary server for a number of
in-addr.arpa subdomains saw its traffic suddenly double, and in-addr.arpa subdomains saw its traffic suddenly double, and
queries of type DNSKEY in particular increase by approximately queries of type DNSKEY in particular increase by approximately
two orders of magnitude, coinciding with a DNSSEC key rollover two orders of magnitude, coinciding with a DNSSEC key rollover
by the zone operator <xref target="roll-over-and-die"/>. by the zone operator <xref target="DNSSEC-ROLLOVER"/>.
This predated a signed root zone and an operating system This predated a signed root zone, and an operating system
vendor was providing non-root trust anchors to the recursive vendor was providing non-root trust anchors to the recursive
resolver, which became out of date following the rollover. resolver, which became out of date following the rollover.
Unable to validate responses for the affected in-addr.arpa Unable to validate responses for the affected in-addr.arpa
zones, recursive resolvers aggressively retried their queries. zones, recursive resolvers aggressively retried their queries.
</t> </t>
<t> <t>
In 2016, the internet infrastructure company Dyn experienced In 2016, the Internet infrastructure company Dyn experienced
a large attack that impacted many high-profile customers. a large attack that impacted many high-profile customers.
As documented in a technical presentation detailing the attack As documented in a technical presentation detailing the attack (see
<xref target="dyn-attack"/>, Dyn staff wrote: <xref target="RETRY-STORM"/>), Dyn staff wrote:</t>
"At this point we are now experiencing botnet attack traffic
and what is best classified as a 'retry storm'. Looking at <blockquote><t>At this point we are now experiencing botnet attack
certain large recursive platforms &gt; 10x normal volume." traffic and what is best classified as a "retry storm"</t>
</t> <t>Looking at certain large recursive platforms &gt; 10x normal
volume</t></blockquote>
<t> <t>
In 2018 the root zone key signing key (KSK) was rolled over In 2018, the root zone Key Signing Key (KSK) was rolled over
<xref target="root-ksk-roll"/>. Throughout the rollover <xref target="KSK-ROLLOVER"/>. Throughout the rollover
period, the root servers experienced a significant increase in period, the root servers experienced a significant increase in
DNSKEY queries. Before the rollover, a.root-servers.net and DNSKEY queries. Before the rollover, a.root-servers.net and
j.root-servers.net together received about 15 million DNSKEY j.root-servers.net together received about 15 million DNSKEY
queries per day. At the end of the revocation period, they queries per day. At the end of the revocation period, they
received 1.2 billion per day -- an 80x increase. Removal of received 1.2 billion per day: an 80x increase. Removal of
the revoked key from the zone caused DNSKEY queries to drop the revoked key from the zone caused DNSKEY queries to drop
to post-rollover but pre-revoke levels, indicating there is to post-rollover but pre-revoke levels, indicating there is
still a population of recursive resolvers using the previous still a population of recursive resolvers using the previous
root trust anchor and aggressively retrying DNSKEY queries. root trust anchor and aggressively retrying DNSKEY queries.
</t> </t>
<t> <t>
In 2021, Verisign researchers used botnet query traffic In 2021, Verisign researchers used botnet query traffic to
to demonstrate that certain large, public recursive DNS demonstrate that certain large public recursive DNS services exhibit
services exhibit very high query rates when all authoritative very high query rates when all authoritative name servers for a zone
name servers for a zone return REFUSED or SERVFAIL <xref return refused (REFUSED) or server failure (SERVFAIL) responses (see
target="botnet"/>. When the authoritative servers were configured norm <xref target="BOTNET"/>). When the authoritative servers were
ally, query rates for configured normally, query rates for a single botnet domain averaged
a single botnet domain averaged approximately 50 queries approximately 50 queries per second. However, with the servers
per second. However, with the servers configured to return SERVFAIL, configured to return SERVFAIL, the query rate increased to 60,000
the query rate increased to 60,000 per second. Furthermore, per second. Furthermore, increases were also observed at the root
increases were also observed at the Root and TLD levels, and Top-Level Domain (TLD) levels, even though delegations at those
even though delegations at those levels were unchanged and levels were unchanged and continued operating normally.
continued operating normally.
</t> </t>
<t> <t>
Later that same year, on October 4, Facebook experienced a Later that same year, on October 4, Facebook experienced a
widespread and well-publicized outage <xref target="fb-outage"/>. Duri widespread and well-publicized outage <xref
ng the 6-hour outage, target="FB-OUTAGE"/>. During the 6-hour outage, none of Facebook's
none of Facebook's authoritative name servers were reachable and authoritative name servers were reachable and did not respond to
did not respond to queries. Recursive name servers attempting to queries. Recursive name servers attempting to resolve Facebook
resolve Facebook domains experienced timeouts. During this time, domains experienced timeouts. During this time, query traffic on the
query traffic on the .COM/.NET infrastructure increased from .COM/.NET infrastructure increased from 7,000 to 900,000 queries per
7,000 to 900,000 queries per second second <xref target="OUTAGE-RESOLVER"/>.
<xref target="fb-outage-verisign"/>.
</t> </t>
</section> </section>
<section>
<section title="Related Work"> <name>Related Work</name>
<t> <t>
<xref target="RFC2308"/> describes negative caching for four <xref target="RFC2308"/> describes negative caching for four types
types of DNS queries and responses: Name errors, no data, of DNS queries and responses: name errors, no data, server failures,
server failures, and dead / unreachable servers. It places and dead/unreachable servers. It places the strongest
the strongest requirements on negative caching requirements on negative caching for name errors and no data
for name errors and no data responses, while server failures responses, while server failures and dead servers are left as
and dead servers are left as optional. optional.
</t> </t>
<t> <t>
<xref target="RFC4697"/> is a Best Current Practice that <xref target="RFC4697"/> is a Best Current Practice that
documents observed resolution misbehaviors. It describes a documents observed resolution misbehaviors. It describes a
number of situations that can lead to excessive queries from number of situations that can lead to excessive queries from
recursive resolvers, including: requerying for delegation data, recursive resolvers, including requerying for delegation data,
lame servers, responses blocked by firewalls, and records lame servers, responses blocked by firewalls, and records
with zero TTL. <xref target="RFC4697"/> makes a number of with zero TTL. <xref target="RFC4697"/> makes a number of
recommendations, varying from "SHOULD" to "MUST." recommendations, varying from "<bcp14>SHOULD</bcp14>" to "<bcp14>MUST</bcp14>".
</t> </t>
<t> <t><xref target="I-D.muks-dnsop-dns-thundering-herd"/> describes "The
An expired Internet-Draft describes "The DNS thundering herd DNS thundering herd problem" as a situation arising when cached data
problem" <xref target="thundering-herd"/> as a situation arising expires at the same time for a large number of users. Although that
when cached data expires at the same time for a large number document is not focused on negative caching, it does describe the
of users. Although that document is not focused on negative benefits of combining multiple identical queries to upstream name
caching, it does describe the benefits of combining multiple, servers. That is, when a recursive resolver receives multiple queries
identical queries to upstream name servers. That is, when for the same name, class, and type that cannot be answered from cached
a recursive resolver receives multiple queries for the same data, it should combine or join them into a single upstream query
name, class, and type that cannot be answered from cached data, rather than emit repeated identical upstream queries.
it should combine or join them into a single upstream query,
rather than emit repeated, identical upstream queries.
</t> </t>
<t> <t>
<xref target="RFC5452"/>, "Measures for Making DNS More <xref target="RFC5452"/>, "<xref target="RFC5452" format="title"/>",
Resilient against Forged Answers," includes a section that includes a section that describes the phenomenon known as "Birthday
describes the phenomenon known as birthday attacks. Here, Attacks". Here, again, the problem arises when a recursive resolver
again, the problem arises when a recursive resolver emits emits multiple identical upstream queries. Multiple outstanding
multiple, identical upstream queries. Multiple outstanding queries make it easier for an attacker to guess and correctly match
queries makes it easier for an attacker to guess and correctly some of the DNS message parameters, such as the port number and ID
match some of the DNS message parameters, such as the port field. This situation is further exacerbated in the case of
number and ID field. This situation is further exacerbated in the timeout-based resolution failures. Of course, DNSSEC is a suitable
case of timeout-based resolution failures. DNSSEC, of course, defense to spoofing attacks.
is a suitable defense to spoofing attacks.
</t> </t>
<t> <t>
<xref target="RFC8767"/> describes "Serving Stale Data to Improve <xref target="RFC8767"/> describes "<xref target="RFC8767"
DNS Resiliency." This permits a recursive resolver to return format="title"/>". This permits a recursive resolver to return
possibly stale data when it is unable to refresh cached, possibly stale data when it is unable to refresh cached, expired
expired data. It introduces the idea of a failure recheck data. It introduces the idea of a failure recheck timer and
timer and says: "Attempts to refresh from non-responsive or says:</t>
<blockquote>Attempts to refresh from non-responsive or
otherwise failing authoritative nameservers are recommended otherwise failing authoritative nameservers are recommended
to be done no more frequently than every 30 seconds." to be done no more frequently than every 30 seconds.</blockquote>
</t>
</section> </section>
<section>
<section title="Terminology"> <name>Terminology</name>
<t> <t> The key words "<bcp14>MUST</bcp14>", "<bcp14>MUST NOT</bcp14>",
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "<bcp14>REQUIRED</bcp14>", "<bcp14>SHALL</bcp14>", "<bcp14>SHALL
"SHOULD", "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and NOT</bcp14>", "<bcp14>SHOULD</bcp14>", "<bcp14>SHOULD NOT</bcp14>",
"OPTIONAL" in this document are to be interpreted as described in "<bcp14>RECOMMENDED</bcp14>", "<bcp14>NOT RECOMMENDED</bcp14>",
BCP 14 <xref target="RFC2119"/> <xref target="RFC8174"/> when, and onl "<bcp14>MAY</bcp14>", and "<bcp14>OPTIONAL</bcp14>" in this document
y when, they appear in all are to be interpreted as described in BCP&nbsp;14 <xref
capitals, as shown here. target="RFC2119"/> <xref target="RFC8174"/> when, and only when, they
appear in all capitals, as shown here.
</t> </t>
<ul> <dl>
<li><t>DNS Transport: In this document, DNS transport means a protocol <dt>DNS transport:</dt>
used to transport DNS messages between a client and a server. This
includes <dd>In this document, "DNS transport" means a protocol used to
"classic DNS" transports, i.e., DNS-over-UDP and DNS-over-TCP <xref transport DNS messages between a client and a server. This includes
target="RFC1034" /> <xref target="RFC7766" />, as "classic DNS" transports, i.e., DNS-over-UDP and DNS-over-TCP <xref
well as newer encrypted DNS transports such as DNS-over-TLS <xref ta target="RFC1034"/> <xref target="RFC7766"/>, as well as newer
rget="RFC7858" />, encrypted DNS transports, such as DNS-over-TLS <xref
DNS-over-HTTPS <xref target="RFC8484" />, DNS-over-QUIC <xref target target="RFC7858"/>, DNS-over-HTTPS <xref target="RFC8484"/>,
="RFC9250" />, DNS-over-QUIC <xref target="RFC9250"/>, and similar communication of
and similar DNS messages using other protocols. Note: at the time of writing,
communication of DNS messages using other protocols. not all DNS transports are standardized for all types
NOTE: at the time of this writing not all DNS transports are standar of servers but may become standardized in the future.</dd>
dized for all types </dl>
of servers, but may become standardized in the future.</t></li>
</ul>
</section> </section>
</section> </section>
<section>
<section title="Conditions That Lead to DNS Resolution Failures"> <name>Conditions That Lead to DNS Resolution Failures</name>
<t> <t>
A DNS resolution failure occurs when none of the servers available A DNS resolution failure occurs when none of the servers available
to a resolver client provide any useful response data for a to a resolver client provide any useful response data for a
particular query name, type, and class. A response is considered particular query name, type, and class. A response is considered
useful when it provides either the requested data, a referral to a descendant zone, useful when it provides either the requested data, a referral to a descendant zone,
or an indication that no data exists at the given name. or an indication that no data exists at the given name.
</t> </t>
<t> <t>
It is common for resolvers to have multiple servers from It is common for resolvers to have multiple servers from
which to choose for a particular query. For example, which to choose for a particular query. For example,
in the case of stub-to-recursive, the stub resolver may be in the case of stub-to-recursive, the stub resolver may be
configured with multiple recursive resolver addresses. In the case of configured with multiple recursive resolver addresses. In the case of
recursive-to-authoritative, a given zone usually has more than recursive-to-authoritative, a given zone usually has more than
one name server (NS record), each of which can have multiple one name server (NS record), each of which can have multiple
IP addresses and multiple DNS transports. IP addresses and multiple DNS transports.
</t> </t>
<t> <t>
Nothing in this document prevents a resolver from retrying a Nothing in this document prevents a resolver from retrying a
query at a different server, or the same server over a different query at a different server or the same server over a different
DNS transport. In the case of timeouts, a resolver can retry the DNS transport. In the case of timeouts, a resolver can retry the
same server and DNS transport a limited number of times. same server and DNS transport a limited number of times.
</t> </t>
<t> <t>
If any one of the available servers provides a useful response, then If any one of the available servers provides a useful response, then
it is not considered a resolution failure. However, if it is not considered a resolution failure. However, if
none of the servers for a given query tuple &lt;name, type, class&gt; none of the servers for a given query tuple &lt;name, type, class&gt;
provide a useful response, the result is a resolution failure. provide a useful response, the result is a resolution failure.
</t> </t>
<t> <t>
Note that NXDOMAIN and NOERROR/NODATA responses are not conditions Note that NXDOMAIN and NOERROR/NODATA responses are not conditions
for resolution failure. In these cases, the server is providing for resolution failure. In these cases, the server is providing
a useful response, either indicating that a name does not exist, a useful response, indicating either that a name does not exist
or that no data of the requested type exists at the name. or that no data of the requested type exists at the name.
These negative responses can be cached as described in <xref These negative responses can be cached as described in
target="RFC2308"/>. <xref target="RFC2308"/>.
</t> </t>
<t> <t>
The remainder of this section describes a number of different The remainder of this section describes a number of different
conditions that can lead to resolution failure. This section is not conditions that can lead to resolution failure. This section is not
exhaustive. Additional conditions exhaustive. Additional conditions
may be expected to cause similar resolution failures. may be expected to cause similar resolution failures.
</t> </t>
<section>
<section title="SERVFAIL Responses"> <name>SERVFAIL Responses</name>
<t> <t>
Server failure is defined in <xref target="RFC1035"/> as Server failure is defined in <xref target="RFC1035"/> as:
"The name server was unable to process this query due to a "The name server was unable to process this query due to a
problem with the name server." A server failure is signaled problem with the name server." A server failure is signaled
by setting the RCODE field to SERVFAIL. by setting the RCODE field to SERVFAIL.
</t> </t>
<t> <t>
Authoritative servers Authoritative servers return SERVFAIL when they don't have any valid
return SERVFAIL when they don't have data for a zone. For example, a secondary server has been
any valid data for a zone. For example, a secondary server has configured to serve a particular zone but is unable to retrieve or
been configured to serve a particular zone, but is unable to refresh the zone data from the primary server.
retrieve or refresh the zone data from the primary server.
</t> </t>
<t> <t>
Recursive servers return SERVFAIL in response to a Recursive servers return SERVFAIL in response to a
number of different conditions, including many described below. number of different conditions, including many described below.
</t> </t>
<t> <t>
Although the extended DNS errors method exists "primarily to extend SE Although the extended DNS errors method exists "primarily to extend
RVFAIL to SERVFAIL to provide additional information," it "does not change the
provide additional information," it "does not change the processing of processing of RCODEs" <xref target="RFC8914"/>. This document
RCODEs" operates at the level of resolution failure and does not concern
<xref target="RFC8914"/>. particular causes.
This document operates at the level of resolution failure and does not
concern particular causes.
</t> </t>
</section> </section>
<section>
<section title="REFUSED Responses"> <name>REFUSED Responses</name>
<t> <t>
A name server returns a message with the RCODE field set to REFUSED when it refuses to A name server returns a message with the RCODE field set to REFUSED when it refuses to
process the query, e.g., for policy or other reasons <xref target="RFC1035"/>. process the query, e.g., for policy or other reasons <xref target="RFC1035"/>.
</t> </t>
<t> <t>
Authoritative servers generally return REFUSED when processing Authoritative servers generally return REFUSED when processing
a query for which they are not authoritative. For example, a query for which they are not authoritative. For example,
a server that is configured to be authoritative for only the a server that is configured to be authoritative for only the
example.net zone, may return REFUSED in response to a query example.net zone may return REFUSED in response to a query
for example.com. for example.com.
</t> </t>
<t> <t>
Recursive servers generally return REFUSED for query Recursive servers generally return REFUSED for query
sources that do not match configured access control lists. sources that do not match configured access control lists.
For example, a server that is configured to allow queries from For example, a server that is configured to allow queries from
only 2001:db8:1::/48 may return REFUSED in response to a query only 2001:db8:1::/48 may return REFUSED in response to a query
from 2001:db8:5::1. from 2001:db8:5::1.
</t> </t>
</section> </section>
<section>
<section title="Timeouts and Unreachable Servers"> <name>Timeouts and Unreachable Servers</name>
<t> <t>
A timeout occurs when a resolver fails to receive any A timeout occurs when a resolver fails to receive any response from
response from a server within a reasonable amount of time. a server within a reasonable amount of time. Additionally, a DNS
Additionally, a DNS transport may more quickly indicate lack transport may more quickly indicate lack of reachability in a way
of reachability in a way that wouldn't be considered a timeout. that wouldn't be considered a timeout: for example, an ICMP port
For example: an ICMP port unreachable message, a TCP "connection refus unreachable message, a TCP "connection refused" error, or a TLS
ed" error, or a TLS handshake failure. handshake failure. <xref target="RFC2308"/> refers to these
<xref target="RFC2308"/> refers to these conditions collectively as "d conditions collectively as "dead / unreachable servers".
ead / unreachable
servers."
</t> </t>
<t> <t>
Note that resolver implementations may have two types of Note that resolver implementations may have two types of
timeouts: a smaller timeout which might trigger a query retry timeouts: a smaller timeout that might trigger a query retry
and a larger timeout after which the server is considered and a larger timeout after which the server is considered
unresponsive. <xref target="reqs-retries-timeouts"/> discusses unresponsive. <xref target="reqs-retries-timeouts"/> discusses
the requirements for resolvers when retrying queries. the requirements for resolvers when retrying queries.
</t> </t>
<t> <t>
Timeouts can present a particular problem for negative Timeouts can present a particular problem for negative
caching, depending on how the resolver handles multiple, caching, depending on how the resolver handles multiple
outstanding queries for the same &lt;query name, type, outstanding queries for the same &lt;query name, type,
class&gt; tuple. For example, consider a very popular class&gt; tuple. For example, consider a very popular
website in a zone whose name servers are all unresponsive. website in a zone whose name servers are all unresponsive.
A recursive resolver might receive tens or hundreds of queries A recursive resolver might receive tens or hundreds of queries
per second for the popular website. If the recursive server per second for that website. If the recursive server
implementation "joins" these outstanding queries together, implementation joins these outstanding queries together,
then it only sends one recursive-to-authoritative query for then it only sends one recursive-to-authoritative query for
the numerous pending stub-to-recursive queries. If, however, the numerous pending stub-to-recursive queries. However, if
the implementation does not join outstanding queries together, the implementation does not join outstanding queries together,
then it sends one recursive-to-authoritative query for each then it sends one recursive-to-authoritative query for each
stub-to-recursive query. If the incoming query rate is high stub-to-recursive query. If the incoming query rate is high
and the timeout is large, this might result in hundreds or and the timeout is large, this might result in hundreds or
thousands of recursive-to-authoritative queries while waiting thousands of recursive-to-authoritative queries while waiting
for an authoritative server to time out. for an authoritative server to time out.
</t> </t>
<t> <t>
A recursive resolver that does not join outstanding queries A recursive resolver that does not join outstanding queries together
together is more susceptible to birthday attacks (<xref is more susceptible to Birthday Attacks (<xref target="RFC5452"
target="RFC5452"/> Section 5), especially when those queries sectionFormat="comma" section="5"/>), especially when those queries
result in timeouts. result in timeouts.
</t> </t>
</section> </section>
<section title="Delegation Loops"> <section>
<name>Delegation Loops</name>
<t> <t>
A delegation loop, or cycle, can occur when one domain utilizes A delegation loop, or cycle, can occur when one domain utilizes
name servers in a second domain, and the second domain uses name servers in a second domain, and the second domain uses
name servers in the first. For example: name servers in the first. For example:
</t> </t>
<figure><artwork align="left"><![CDATA[ <sourcecode type="dns-rr"><![CDATA[
FOO.EXAMPLE. NS NS1.EXAMPLE.COM. FOO.EXAMPLE. NS NS1.EXAMPLE.COM.
FOO.EXAMPLE. NS NS2.EXAMPLE.COM. FOO.EXAMPLE. NS NS2.EXAMPLE.COM.
EXAMPLE.COM. NS NS1.FOO.EXAMPLE. EXAMPLE.COM. NS NS1.FOO.EXAMPLE.
EXAMPLE.COM. NS NS2.FOO.EXAMPLE. EXAMPLE.COM. NS NS2.FOO.EXAMPLE.
]]></artwork></figure> ]]></sourcecode>
<t> <t>
In this example, no names under foo.example or example.com can be In this example, no names under foo.example or example.com can be
resolved because of the delegation loop. Note that a delegation loop resolved because of the delegation loop. Note that a delegation loop
may involve more than two domains. A resolver that does not may involve more than two domains. A resolver that does not
detect delegation loops may generate DDoS-levels of attack traffic detect delegation loops may generate DDoS-levels of attack traffic
to authoritative name servers, as documented in the TsuNAME vulnerabil ity to authoritative name servers, as documented in the TsuNAME vulnerabil ity
<xref target="TsuNAME"/>. <xref target="TsuNAME"/>.
</t> </t>
</section> </section>
<section>
<section title="Alias Loops"> <name>Alias Loops</name>
<t> <t>
An alias loop, or cycle, can occur when one CNAME or DNAME RR refers t o An alias loop, or cycle, can occur when one CNAME or DNAME RR refers t o
a second name, which in turn is specified as an alias for the first. a second name, which, in turn, is specified as an alias for the first.
For example: For example:
</t> </t>
<figure><artwork align="left"><![CDATA[ <sourcecode type="dns-rr"><![CDATA[
APP.FOO.EXAMPLE. CNAME APP.EXAMPLE.NET. APP.FOO.EXAMPLE. CNAME APP.EXAMPLE.NET.
APP.EXAMPLE.NET. CNAME APP.FOO.EXAMPLE. APP.EXAMPLE.NET. CNAME APP.FOO.EXAMPLE.
]]></artwork></figure> ]]></sourcecode>
<t> <t>
The need to detect CNAME loops has been known since at least The need to detect CNAME loops has been known since at least <xref
<xref target="RFC1034"/> which states in Section 3.6.2: target="RFC1034"/>, which states in Section <xref target="RFC1034"
</t> sectionFormat="bare" section="3.6.2"/>:
<t> </t>
"Of course, by the robustness principle, domain software should <blockquote>
Of course, by the robustness principle, domain software should
not fail when presented with CNAME chains or loops; CNAME chains not fail when presented with CNAME chains or loops; CNAME chains
should be followed and CNAME loops signaled as an error." should be followed and CNAME loops signalled as an error.
</t> </blockquote>
</section> </section>
<section>
<section title="DNSSEC Validation Failures"> <name>DNSSEC Validation Failures</name>
<t> <t>
For zones that are signed with DNSSEC, a resolution failure can For zones that are signed with DNSSEC, a resolution failure can
occur when a security-aware resolver believes it should be able occur when a security-aware resolver believes it should be able
to establish a chain-of-trust for an RRset but is unable to do to establish a chain of trust for an RRset but is unable to do
so, possibly after trying multiple authoritative name servers. so, possibly after trying multiple authoritative name servers.
DNSSEC validation failures may be due to signature mismatch, DNSSEC validation failures may be due to signature mismatch,
missing DNSKEY RRs, problems with denial-of-existence records, missing DNSKEY RRs, problems with denial-of-existence records,
clock skew, clock skew,
or other reasons. or other reasons.
</t> </t>
<t> <t>
Section 4.7 of <xref target="RFC4035"/> already discusses <xref target="RFC4035" sectionFormat="of" section="4.7"/> already disc usses
the requirements and reasons for caching validation failures. the requirements and reasons for caching validation failures.
<xref target="dnssec-reqs"/> of this document strengthens those requir ements. <xref target="dnssec-reqs"/> of this document strengthens those requir ements.
</t> </t>
</section> </section>
<section>
<section title="FORMERR Responses"> <name>FORMERR Responses</name>
<t> <t>
A name server returns a message with the RCODE field set to A name server returns a message with the RCODE field set to
FORMERR when it is unable to interpret the query <xref target="RFC1035 "/>. FORMERR FORMERR when it is unable to interpret the query <xref target="RFC1035 "/>. FORMERR
responses are often associated with problems processing EDNS(0) responses are often associated with problems processing Extension Mech
Extensions <xref target="RFC6891"/>. Authoritative servers anisms for DNS (EDNS(0)) <xref target="RFC6891"/>. Authoritative servers
may return FORMERR when they do not implement EDNS(0), or may return FORMERR when they do not implement EDNS(0), or
when EDNS(0) option fields are malformed, but not for unknown when EDNS(0) option fields are malformed, but not for unknown
EDNS(0) options. EDNS(0) options.
</t> </t>
<t> <t>
Upon receipt of a FORMERR response, some recursive clients will Upon receipt of a FORMERR response, some recursive clients will
retry their queries without EDNS(0), while others will not. Nonethele ss, resolution failures retry their queries without EDNS(0), while others will not. Nonethele ss, resolution failures
from FORMERR responses are rare. from FORMERR responses are rare.
</t> </t>
</section> </section>
skipping to change at line 556 skipping to change at line 488
may return FORMERR when they do not implement EDNS(0), or may return FORMERR when they do not implement EDNS(0), or
when EDNS(0) option fields are malformed, but not for unknown when EDNS(0) option fields are malformed, but not for unknown
EDNS(0) options. EDNS(0) options.
</t> </t>
<t> <t>
Upon receipt of a FORMERR response, some recursive clients will Upon receipt of a FORMERR response, some recursive clients will
retry their queries without EDNS(0), while others will not. Nonethele ss, resolution failures retry their queries without EDNS(0), while others will not. Nonethele ss, resolution failures
from FORMERR responses are rare. from FORMERR responses are rare.
</t> </t>
</section> </section>
</section> </section>
<section>
<section title="Requirements for Caching DNS Resolution Failures"> <name>Requirements for Caching DNS Resolution Failures</name>
<section anchor="reqs-retries-timeouts">
<section title="Retries and Timeouts" anchor="reqs-retries-timeouts"> <name>Retries and Timeouts</name>
<t> <t>
A resolver MUST NOT retry a given query to a server address over a giv en DNS transport more than twice A resolver <bcp14>MUST NOT</bcp14> retry a given query to a server add ress over a given DNS transport more than twice
(i.e., three queries in total) before considering the server address (i.e., three queries in total) before considering the server address
unresponsive over that DNS transport for that query. unresponsive over that DNS transport for that query.
</t> </t>
<t> <t>
A resolver MAY retry a given query over a different DNS transport to t he same server A resolver <bcp14>MAY</bcp14> retry a given query over a different DNS transport to the same server
if it has reason to believe the DNS transport is available for that se rver and is if it has reason to believe the DNS transport is available for that se rver and is
compatible with the resolver's security policies. compatible with the resolver's security policies.
</t> </t>
<t> <t>
This document does not place any requirements on how long an implement ation should This document does not place any requirements on how long an implement ation should
wait before retrying a query (aka timeout value), wait before retrying a query (aka a timeout value),
which may be implementation- or configuration-dependent. which may be implementation or configuration dependent.
It is generally expected that typical timeout values range It is generally expected that typical timeout values range
from 3 to 30 seconds. from 3 to 30 seconds.
</t> </t>
</section> </section>
<section anchor="caching">
<section title="Caching" anchor="caching"> <name>Caching</name>
<t> <t>
Resolvers MUST implement a cache for resolution failures. Resolvers <bcp14>MUST</bcp14> implement a cache for resolution failure s.
The purpose of this cache is to eliminate repeated upstream The purpose of this cache is to eliminate repeated upstream
queries that cannot be resolved. queries that cannot be resolved.
When an incoming query matches a cached resolution failure, the resolv er MUST NOT send When an incoming query matches a cached resolution failure, the resolv er <bcp14>MUST NOT</bcp14> send
any corresponding outgoing queries until after the cache entries expir e. any corresponding outgoing queries until after the cache entries expir e.
</t> </t>
<t> <t>
Implementation details for such a cache are not specified Implementation details for such a cache are not specified
in this document. The implementation might cache different in this document. The implementation might cache different
resolution failure conditions differently. For example, DNSSEC resolution failure conditions differently. For example, DNSSEC
validation failures might be cached according to the queried validation failures might be cached according to the queried
name, class, and type, whereas unresponsive servers might be name, class, and type, whereas unresponsive servers might be
cached only according to the server's IP address. cached only according to the server's IP address.
Developers should document their implementation choices so Developers should document their implementation choices so
that operators know what behaviors to expect when resolution that operators know what behaviors to expect when resolution
failures are cached. failures are cached.
</t> </t>
<t> <t>
Resolvers MUST cache resolution failures for at least 1 second. Resolvers <bcp14>MUST</bcp14> cache resolution failures for at least
Resolvers MAY cache different types of resolution failures for differe 1 second. Resolvers <bcp14>MAY</bcp14> cache different types of
nt (i.e., longer) amounts of time. resolution failures for different (i.e., longer) amounts of time.
Consistent with <xref target="RFC2308"/>, resolution failures MUST NOT Consistent with <xref target="RFC2308"/>, resolution failures
be cached for longer than <bcp14>MUST NOT</bcp14> be cached for longer than 5 minutes.
5 minutes.
</t> </t>
<t> <t>
The minimum cache duration SHOULD be configurable by the operator. The minimum cache duration <bcp14>SHOULD</bcp14> be configurable by
A longer cache duration for resolution failures will the operator. A longer cache duration for resolution failures will
reduce the processing burden from repeated queries, but reduce the processing burden from repeated queries but may also
may also increase the time to recover from transitory issues. increase the time to recover from transitory issues.
</t> </t>
<t> <t>
Resolvers SHOULD employ an exponential or linear backoff algorithm to Resolvers <bcp14>SHOULD</bcp14> employ an exponential or linear
increase the cache duration for persistent resolution failures. For ex backoff algorithm to increase the cache duration for persistent
ample, resolution failures. For example, the initial time for negatively
the initial time for negatively caching a resolution failure caching a resolution failure might be set to 5 seconds and
might be set to 5 seconds, and increased after each retry that results increased after each retry that results in another resolution
in another resolution failure, up to a configurable maximum, not to ex failure, up to a configurable maximum, not to exceed the 5-minute
ceed the 5-minute upper limit. upper limit.
</t> </t>
<t> <t>
Notwithstanding the above, resolvers SHOULD implement measures to miti Notwithstanding the above, resolvers <bcp14>SHOULD</bcp14> implement
gate resource exhaustion measures to mitigate resource exhaustion attacks on the failed
attacks on the failed resolution cache. That is, the resolver should l resolution cache. That is, the resolver should limit the amount of
imit the amount of memory memory and/or processing time devoted to this cache.
and/or processing time devoted to this cache.
</t> </t>
</section> </section>
<section>
<section title="Requerying Delegation Information"> <name>Requerying Delegation Information</name>
<t> <t>
Section 2.1 of <xref target="RFC4697"/> identifies circumstances in which <xref target="RFC4697" sectionFormat="of" section="2.1"/> identifies
"every circumstances in which:</t>
name server in a zone's NS RRSet is unreachable (e.g., during a network <blockquote>...every name server in a zone's NS RRSet is unreachable
outage), (e.g., during a network outage), unavailable (e.g., the name server
unavailable (e.g., the name server process is not running on the server process is not running on the server host), or misconfigured (e.g.,
host), or the name server is not authoritative for the given zone, also known as
misconfigured (e.g., the name server is not authoritative for the give "lame").</blockquote>
n zone, <t>It prohibits unnecessary "aggressive requerying" to the
also known as 'lame')." It prohibits unnecessary "aggressive requeryin
g" to the
parent of a non-responsive zone by sending NS queries. parent of a non-responsive zone by sending NS queries.
</t> </t>
<t> <t>
The problem of aggressive requerying to parent zones is not limited to The problem of aggressive requerying to parent zones is not limited
queries of type NS. to queries of type NS. This document updates the requirement from
This document updates the requirement from section 2.1.1 of <xref targ <xref target="RFC4697" sectionFormat="of" section="2.1.1"/> to apply
et="RFC4697"/> more generally:</t>
to apply more generally: <blockquote>
Upon encountering a zone whose name servers are all non-responsive, Upon encountering a zone whose name servers are all
a resolver MUST cache the resolution failure. non-responsive, a resolver <bcp14>MUST</bcp14> cache the resolution
Furthermore, the resolver MUST limit queries to the non-responsive failure. Furthermore, the resolver <bcp14>MUST</bcp14> limit
zone's parent zone (and to other ancestor zones) just as it queries to the non-responsive zone's parent zone (and to other
would limit subsequent queries to the non-responsive zone. ancestor zones) just as it would limit subsequent queries to the
</t> non-responsive zone.</blockquote>
</section>
<section title="DNSSEC Validation Failures" anchor="dnssec-reqs"> </section>
<section anchor="dnssec-reqs">
<name>DNSSEC Validation Failures</name>
<t> <t>
Section 4.7 of <xref target="RFC4035"/> states: <xref target="RFC4035" sectionFormat="of" section="4.7"/> states:
</t> </t>
<t> <blockquote>
To prevent such unnecessary DNS traffic, security-aware To prevent such unnecessary DNS traffic, security-aware
resolvers MAY cache data with invalid signatures, with some resolvers <bcp14>MAY</bcp14> cache data with invalid signatures, with some
restrictions. restrictions.
</t> </blockquote>
<t> <t>
This document updates <xref target="RFC4035"/> with the following, str onger requirement: This document updates <xref target="RFC4035"/> with the following, str onger, requirement:
</t> </t>
<t> <blockquote>
To prevent such unnecessary DNS traffic, security-aware To prevent such unnecessary DNS traffic, security-aware
resolvers MUST cache DNSSEC validation failures, with some resolvers <bcp14>MUST</bcp14> cache DNSSEC validation failures, with s ome
restrictions. restrictions.
</t> </blockquote>
<t> <t>
One of the restrictions mentioned in <xref target="RFC4035"/> One of the restrictions mentioned in <xref target="RFC4035"/>
is to use a small TTL when caching data that fails DNSSEC is to use a small TTL when caching data that fails DNSSEC
validation. This is, in part, because the provided TTL cannot validation. This is, in part, because the provided TTL cannot
be trusted. The advice from <xref target="caching"/> be trusted. The advice from <xref target="caching"/>
herein can be used as guidance on TTLs for caching DNSSEC herein can be used as guidance on TTLs for caching DNSSEC
validation failures. validation failures.
</t> </t>
</section> </section>
</section> </section>
<section anchor="iana">
<section title="IANA Considerations" anchor="iana"> <name>IANA Considerations</name>
<t> <t>
This document has no IANA actions. This document has no IANA actions.
</t> </t>
</section> </section>
<section anchor="security">
<section title="Security Considerations" anchor="security"> <name>Security Considerations</name>
<t> <t>
As noted in <xref target="caching"/>, an attacker might attempt a resour ce As noted in <xref target="caching"/>, an attacker might attempt a resour ce
exhaustion attack by sending queries for a large number exhaustion attack by sending queries for a large number
of names and/or types that result in resolution failure. Resolvers of names and/or types that result in resolution failure. Resolvers
SHOULD implement measures to protect themselves and bound the <bcp14>SHOULD</bcp14> implement measures to protect themselves and bound the
amount of memory devoted to caching resolution failures. amount of memory devoted to caching resolution failures.
</t> </t>
<t> <t>
A cache poisoning attack (see section 2.2 of <xref target="RFC7873"/>) A cache poisoning attack (see <xref target="RFC7873"
resulting in denial of service sectionFormat="of" section="2.2"/>) resulting in denial of service may b
may be possible because failure messages cannot be e possible
signed. An attacker might generate queries and send forged failure messa because failure messages cannot be signed. An attacker might generate
ges, queries and send forged failure messages, causing the resolver to
causing the resolver to cease sending queries to the authoritative name cease sending queries to the authoritative name server (see <xref
server target="RFC4732" sectionFormat="of" section="2.6"/> for a similar
(see 2.6 of <xref target="RFC4732"/> for a similar "data corruption atta "data corruption attack" and Section 5.2 of <xref target="TuDoor"/>
ck"). for a "DNSDoS attack"). However, this would require continued
However, this would require continued spoofing throughout the backoff pe spoofing throughout the backoff period and repeated attacks due to the
riod and required attacks 5-minute cache limit. As in <xref target="RFC4686" sectionFormat="of"
due to the 5 minute cache limit. As in section 4.1.12 of <xref target="R section="4.1.12"/>, this attack's effects would be "localized and of
FC4686"/>, limited duration".
this attack's effects would be "localized and of limited duration."
</t> </t>
</section> </section>
<section anchor="privacy">
<section title="Privacy Considerations" anchor="privacy"> <name>Privacy Considerations</name>
<t>This specification has no impact on user privacy.</t> <t>This specification has no impact on user privacy.</t>
</section> </section>
<section title="Acknowledgments" anchor="acknowledgments">
<t>
The authors wish to thank
Mukund Sivaraman,
Petr Spacek,
Peter van Dijk,
    Tim Wicinski,
Joe Abley,
Evan Hunt,
Barry Leiba,
Lucas Pardue,
Paul Wouters,
and other members of the DNSOP working group for their feedback and cont
ributions.
</t>
</section>
<section anchor="Changes" title="Change Log">
<t>RFC Editor: Please remove this section before publication.</t>
<t>This section lists substantial changes to the document as it is being w
orked on.</t>
<t>From -00 to -01:
<list style="symbols">
<t>use phrase "the initial TTL for negatively caching a resolution failu
re" instead of "negative cache TTL"</t>
<t>typos, etc</t>
</list></t>
<t>From dwmtwc-01 to ietf-00:
<list style="symbols">
<t>Adopted by WG</t>
</list></t>
<t>From -00 to -01:
<list style="symbols">
<t>Clarify retries and timeouts to apply on a per-query basis.</t>
<t>Say more about the 5 second caching requirement in TTLs section.</t>
<t>Expanded opening paragraphs of section 2, now titled "Conditions That
Lead To DNS Resolution Failures".</t>
<t>Text from the former section 3.3 ("Scope") moved to top of section 2.
</t>
<t>Section 3.2 was formerly "TTLs" and is now "Caching". The draft no l
onger requires e.g. caching by tuples, but now just requires caching failures so
that repeated queries are not sent out.</t>
<t>State that resolvers should protect themselves from cache resource ex
haustion attacks.</t>
</list></t>
<t>From -01 to -02:
<list style="symbols">
<t>Added cache poisoning attack to Security Considerations.</t>
</list></t>
<t>From -02 to -03:
<list style="symbols">
<t>Added missing reference to Verisign blog post.</t>
</list></t>
<t>From -03 to -04:
<list style="symbols">
<t>Address most of Peter van Dijk's DNS Directorate review comments.</t
>
     <t>Removed "For Discussion" section from introduction referencing
     apparently inconsistent RFC2119 keyword use in RFC2308.</t>
<t>Replaced "For Discussion" section from "Requerying Delegation Inform
ation" to generalize RFC 4697 requirements not to requery parent zones to cover
all query types.</t>
<t>Replaced "For Discussion" section from "DNSSEC Validation Failures"
to strengthen RFC 4035 to require caching of DNSSEC validation failures.</t>
<t>Added RFC 4035 and RFC 4697 to updated RFCs list.</t>
<t>Added (empty) Implementation Status section.</t>
</list></t>
<t>From -04 to -05:
<list style="symbols">
<t>Expanded abstract to include updates to RFCs 4035 and 4697.</t>
<t>Removed reference to unused terms from RFC 8126.</t>
<t>Reworded "server transport" to "a server address over a given transp
ort".</t>
<t>Added explanatory text in "Server Failure" section for exclusion of
extended DNS errors</t>
<t>Changed "Timeouts" section to "Timeouts and Unreachable Servers" and
added reference to transport layer indicators from RFC 2308.</t>
<t>Clarified meaning of "timeout value".</t>
</list></t>
<t>From -05 to -06:
<list style="symbols">
<t>Changed minimum 5 second caching to 1 second, with other changes to g
ive implementors and operators more leeway.</t>
<t>Changed "exponential backoff" to more general concept of increasing b
ackoff.</t>
<t>Added some implementation status notes for BIND, from dnsop list emai
l.</t>
</list>
</t>
<t>From -06 to -07:
<list style="symbols">
<t>Artart review: minor editorial clarifications</t>
<t>Genart review: remove confusing and superfluous section references.</
t>
<t>Genart review: clarify resolution failure caching time range.</t>
<t>Genart review: better define DNS transports</t>
<t>Dnsdir review: clarify FORMERR response retries.</t>
</list>
</t>
<t>From -07 to -08:
<list style="symbols">
<t>"only exacerbated" -> "further exacerbated"</t>
<t>lowercase IPv6 addresses</t>
<t>lowercase example domain in text</t>
<t>updated introduction to include all updated RFCs</t>
<t>change 3.2 SHOULD to should</t>
<t>section 3.4: say a little about "some restrictions" from RFC 4035</t>
<t>Intdir telechat review: a few grammatical nits</t>
<t>Various IESG reviewer suggestions</t>
</list>
</t>
</section>
<section title="Implementation Status">
<t>
RFC Editor: Please remove this section before publication.
</t>
<t>
This section records the status of known implementations of the
protocol defined by this specification at the time of posting of
this Internet-Draft, and is based on a proposal described in
RFC 7942. The description of implementations in this section is
intended to assist the IETF in its decision processes in
progressing drafts to RFCs. Please note that the listing of any
individual implementation here does not imply endorsement by the
IETF. Furthermore, no effort has been spent to verify the
information presented here that was supplied by IETF contributors.
This is not intended as, and must not be construed to be, a
catalog of available implementations or their features. Readers
are advised to note that other implementations may exist.
</t>
<section title="BIND">
<t>
The following is excerpted from a message to the dnsop mailing list re
garding
how BIND caches resolution failures:
</t>
<t>
BIND implemented a SERVFAIL cache in 2014 with a default
cache duration of 10 seconds; after a slew of complaints, in 2015 we
lowered it to 1 second, and also reduced the configurable maximum from
5 minutes to 30 seconds. The reason was that certain common failure
conditions are transitory, and it's not unreasonable to prioritize
rapid recovery.
</t>
<t>
Now, to be clear, the comparison isn't exactly apples to apples: the B
IND
SERVFAIL cache is a somewhat stupider mechanism than the one outlined
in
the draft. It caches *all* SERVFAIL responses, regardless of the reaso
n
they were generated. For example: when the cache is cold, a query may
time
out or hit DDoS mitigation limits before it's finished getting through
the
whole iteration process; an immediate retry would start further along
the
delegation chain and would succeed. Such problems weren't noticeable u
ntil
we implemented the 10-second cache, but became very noticeable afterwa
rd.
</t>
<t>
If we were able to selectively cache *only* those SERVFAILs that are
unlikely to recover soon, then five seconds might indeed be a good starting
point. But, with our relatively dumb cache, we found that one second did a
fairly good job reducing the processing burden from repeated queries, and
eliminated the user complaints about the resolver taking forever to recover
from short-lived problems. It's been working well enough that it hasn't
been a priority to develop a more complex failure cache.
</t>
</section>
</section>
</middle> </middle>
<back> <back>
<references title="Normative References"> <displayreference target="I-D.muks-dnsop-dns-thundering-herd" to="THUNDERING-HER
&RFC1034; D"/>
&RFC1035;
&RFC2119;
&RFC2308;
&RFC4035;
&RFC4697;
&RFC8174;
</references>
<references title="Informative References"> <references>
&RFC0882; <name>References</name>
&RFC0883; <references>
&RFC4686; <name>Normative References</name>
&RFC4732; <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.1
&RFC5452; 034.xml"/>
&RFC6891; <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.1
&RFC7766; 035.xml"/>
&RFC7858; <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.2
&RFC7873; 119.xml"/>
&RFC8484; <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.2
&RFC8767; 308.xml"/>
&RFC8914; <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.4
&RFC9250; 035.xml"/>
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.4
697.xml"/>
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.8
174.xml"/>
</references>
<references>
<name>Informative References</name>
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.0
882.xml"/>
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.0
883.xml"/>
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.4
686.xml"/>
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.4
732.xml"/>
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.5
452.xml"/>
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.6
891.xml"/>
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.7
766.xml"/>
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.7
858.xml"/>
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.7
873.xml"/>
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.8
484.xml"/>
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.8
767.xml"/>
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.8
914.xml"/>
<xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.9
250.xml"/>
<reference anchor="botnet" target="https://indico.dns-oarc.net/event/38/contributions/841/"> <reference anchor="TuDoor" target="https://doi.ieeecomputersociety.org/10.1109/SP54263.2024.00046">
<front> <front>
<title>Botnet Traffic Observed at Various Levels of the DNS Hierarchy< <title>TuDoor Attack: Systematically Exploring and Exploiting Logic Vu
/title> lnerabilities in DNS Response Pre-processing with Malformed Packets</title>
<author initials="D." surname="Wessels" fullname="Duane Wessels"/> <author fullname="Xiang Li" initials="X." surname="Li"/>
<author initials="M." surname="Thomas" fullname="Matt Thomas"/> <author fullname="Wei Xu" initials="W." surname="Xu"/>
<date year="2021" month="May"/> <author fullname="Baojun Liu" initials="B." surname="Liu"/>
<author fullname="Mingming Zhang" initials="M." surname="Zhang"/>
<author fullname="Zhou Li" initials="Z." surname="Li"/>
<author fullname="Jia Zhang" initials="J." surname="Zhang"/>
<author fullname="Deliang Chang" initials="D." surname="Chang"/>
<author fullname="Xiaofeng Zheng" initials="X." surname="Zheng"/>
<author fullname="Chuhan Wang" initials="C." surname="Wang"/>
<author fullname="Jianjun Chen" initials="J." surname="Chen"/>
<author fullname="Haixin Duan" initials="H." surname="Duan"/>
<author fullname="Qi Li" initials="Q." surname="Li"/>
<date year="2024"/>
</front> </front>
</reference> <refcontent>IEEE Symposium on Security and Privacy (SP)</refcontent>
<seriesInfo name="DOI" value="10.1109/SP54263.2024.00046"/>
</reference>
<reference anchor="fb-outage" target="https://engineering.fb.com/2021/10/05 <reference anchor="BOTNET" target="https://indico.dns-oarc.net/event/38/
/networking-traffic/outage-details/"> contributions/841/">
<front> <front>
<title>More details about the October 4 outage</title> <title>Botnet Traffic Observed at Various Levels of the DNS Hierarch
<author initials="S." surname="Janardhan" fullname="Santosh Janardhan" y</title>
/> <author initials="D." surname="Wessels" fullname="Duane Wessels"/>
<date year="2021" month="October"/> <author initials="M." surname="Thomas" fullname="Matt Thomas"/>
</front> <date year="2021" month="May"/>
</reference> </front>
</reference>
<reference anchor="fb-outage-verisign" target="https://blog.verisign.com/se <reference anchor="FB-OUTAGE" target="https://engineering.fb.com/2021/10
curity/facebook-dns-outage/"> /05/networking-traffic/outage-details/">
<front> <front>
<title>Observations on Resolver Behavior During DNS Outages</title> <title>More details about the October 4 outage</title>
<author> <author initials="S." surname="Janardhan" fullname="Santosh Janardha
<organization>Verisign</organization> n"/>
</author> <date year="2021" month="October"/>
<date year="2022" month="January" day="20"/> </front>
</front> </reference>
</reference>
<reference anchor="TsuNAME" target="https://dl.acm.org/doi/10.1145/3487552. <reference anchor="OUTAGE-RESOLVER" target="https://blog.verisign.com/se
3487824"> curity/facebook-dns-outage/">
<front> <front>
<title>TsuNAME: exploiting misconfiguration and vulnerability to DDoS <title>Observations on Resolver Behavior During DNS Outages</title>
DNS</title> <author>
<author initials="G. C. M." surname="Moura" fullname="Giovane C. M. Mo <organization>Verisign</organization>
ura"/> </author>
<author initials="S." surname="Castro" fullname="Sebastian Castro"/> <date year="2022" month="January"/>
<author initials="J." surname="Heidemann" fullname="John Heidemann"/> </front>
<author initials="W." surname="Hardaker" fullname="Wes Hardaker"/> </reference>
<date year="2021" month="November"/>
</front>
</reference>
<reference anchor="roll-over-and-die" target="https://www.potaroo.net/ispco <reference anchor="TsuNAME">
l/2010-02/rollover.html"> <front>
<front> <title>TsuNAME: exploiting misconfiguration and vulnerability to DDo
<title>Roll Over and Die?</title> S DNS</title>
<author initials="G." surname="Michaelson" fullname="George Michaelson <author initials="G. C. M." surname="Moura" fullname="Giovane C. M.
"/> Moura"/>
<author initials="P." surname="Wallstr&ouml;m" fullname="Patrik Wallst <author initials="S." surname="Castro" fullname="Sebastian Castro"/>
r&ouml;m"/> <author initials="J." surname="Heidemann" fullname="John Heidemann"/
<author initials="R." surname="Arends" fullname="Roy Arends"/> >
<author initials="G." surname="Huston" fullname="Geoff Huston"/> <author initials="W." surname="Hardaker" fullname="Wes Hardaker"/>
<date year="2010" month="February"/> <date year="2021" month="November"/>
</front> </front>
</reference> <refcontent>IMC '21: Proceedings of the 21st ACM Internet
Measurement Conference, Pages 398-418</refcontent>
<seriesInfo name="DOI" value="10.1145/3487552.3487824"/>
</reference>
<reference anchor="dyn-attack" target="https://ccnso.icann.org/sites/defaul <reference anchor="DNSSEC-ROLLOVER" target="https://www.potaroo.net/ispc
t/files/file/field-file-attach/2017-04/presentation-oracle-dyn-ddos-dns-13mar17- ol/2010-02/rollover.html">
en.pdf"> <front>
<front> <title>Roll Over and Die?</title>
<title>Dyn, DDoS, and DNS</title> <author initials="G." surname="Michaelson" fullname="George Michaels
<author initials="A." surname="Sullivan" fullname="Andrew Sullivan"/> on"/>
<date year="2017" month="March"/> <author initials="P." surname="Wallström" fullname="Patrik Wallström
</front> "/>
</reference> <author initials="R." surname="Arends" fullname="Roy Arends"/>
<author initials="G." surname="Huston" fullname="Geoff Huston"/>
<date year="2010" month="February"/>
</front>
</reference>
<reference anchor="root-ksk-roll" target="https://dl.acm.org/doi/10.1145/33 <reference anchor="RETRY-STORM" target="https://ccnso.icann.org/sites/de
55369.3355570"> fault/files/file/field-file-attach/2017-04/presentation-oracle-dyn-ddos-dns-13ma
<front> r17-en.pdf">
<title>Roll, Roll, Roll Your Root: A Comprehensive Analysis of the Fir <front>
st Ever DNSSEC Root KSK Rollover</title> <title>Dyn, DDoS, and DNS</title>
<author fullname="Moritz M&uuml;ller" initials="M." surname="M&uuml;ll <author initials="A." surname="Sullivan" fullname="Andrew Sullivan"/
er"/> >
<author fullname="Matthew Thomas" initials="M." surname="Thomas"/> <date year="2017" month="March"/>
<author fullname="Duane Wessels" initials="D." surname="Wessels"/> </front>
<author fullname="Wes Hardaker" initials="W." surname="Hardaker"/> </reference>
<author fullname="Taejoong Chung" initials="T." surname="Chung"/>
<author fullname="Willem Toorop" initials="W." surname="Toorop"/>
<author fullname="Roland van Rijswijk-Deij" initials="R.v." surname="R
ijswijk-Deij"/>
<date year="2019" month="Oct"/>
</front>
</reference>
<reference anchor="thundering-herd" target="https://datatracker.ietf.org/d <reference anchor="KSK-ROLLOVER">
oc/draft-muks-dnsop-dns-thundering-herd/"> <front>
<front> <title>Roll, Roll, Roll Your Root: A Comprehensive Analysis of the F
<title>The DNS thundering herd problem (expired Internet-Draft)</title irst Ever DNSSEC Root KSK Rollover</title>
> <author fullname="Moritz Müller" initials="M." surname="Müller"/>
<author fullname="Mukund Sivaraman" initials="M." surname="Sivaraman"/ <author fullname="Matthew Thomas" initials="M." surname="Thomas"/>
> <author fullname="Duane Wessels" initials="D." surname="Wessels"/>
<author fullname="Cricket Liu" initials="C." surname="Liu"/> <author fullname="Wes Hardaker" initials="W." surname="Hardaker"/>
<date year="2020" month="Jun"/> <author fullname="Taejoong Chung" initials="T." surname="Chung"/>
</front> <author fullname="Willem Toorop" initials="W." surname="Toorop"/>
</reference> <author fullname="Roland van Rijswijk-Deij" initials="R." surname="v
an Rijswijk-Deij"/>
<date year="2019" month="Oct"/>
</front>
<refcontent>IMC '19: Proceedings of the Internet Measurement Conference
, Pages 1-14</refcontent>
<seriesInfo name="DOI" value="10.1145/3355369.3355570"/>
</reference>
<xi:include href="https://bib.ietf.org/public/rfc/bibxml3/reference.I-D.m
uks-dnsop-dns-thundering-herd.xml"/>
</references>
</references> </references>
<section anchor="acknowledgments" numbered="false">
<name>Acknowledgments</name>
<t>
The authors wish to thank <contact fullname="Mukund Sivaraman"/>,
<contact fullname="Petr Spacek"/>, <contact fullname="Peter van
Dijk"/>, <contact fullname="Tim Wicinski"/>, <contact fullname="Joe
Abley"/>, <contact fullname="Evan Hunt"/>, <contact fullname="Barry
Leiba"/>, <contact fullname="Lucas Pardue"/>, <contact fullname="Paul
Wouters"/>, and other members of the DNSOP Working Group for their
feedback and contributions.
</t>
</section>
</back> </back>
</rfc> </rfc>
 End of changes. 114 change blocks. 
701 lines changed or deleted 468 lines changed or added

This html diff was produced by rfcdiff 1.48.