<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE rfc [
<!ENTITY nbsp "&#160;">
<!ENTITY zwsp "&#8203;">
<!ENTITY nbhy "&#8209;">
<!ENTITY wj "&#8288;">
]>
<rfc xmlns:xi="http://www.w3.org/2001/XInclude" ipr="trust200902"
     docName="draft-koster-rep-12" number="9309" obsoletes="" updates=""
     submissionType="IETF" category="std" consensus="true" xml:lang="en"
     tocInclude="true" tocDepth="4" symRefs="true" sortRefs="true"
     version="3">
  <!-- xml2rfc v2v3 conversion 3.13.0 -->
  <front>
    <title abbrev="Robots Exclusion Protocol (REP)">Robots Exclusion Protocol</title>
    <seriesInfo name="RFC" value="9309"/>
    <author initials="M." surname="Koster" fullname="Martijn Koster">
      <organization>Stalworthy Computing, Ltd.</organization>
      <address>
        <postal>
          <extaddr>Stalworthy Manor Farm</extaddr>
          <street>Suton Lane</street>
          <city>Wymondham, Norfolk</city>
          <code>NR18 9JG</code>
          <country>United Kingdom</country>
        </postal>
        <email>m.koster@greenhills.co.uk</email>
      </address>
    </author>
    <author initials="G." surname="Illyes" fullname="Gary Illyes">
      <organization>Google LLC</organization>
      <address>
        <postal>
          <street>Brandschenkestrasse 110</street>
          <city>Zürich</city>
          <code>8002</code>
          <country>Switzerland</country>
        </postal>
        <email>garyillyes@google.com</email>
      </address>
    </author>
    <author initials="H." surname="Zeller" fullname="Henner Zeller">
      <organization>Google LLC</organization>
      <address>
        <postal>
          <street>1600 Amphitheatre Pkwy</street>
          <city>Mountain View</city>
          <region>CA</region>
          <code>94043</code>
          <country>United States of America</country>
        </postal>
        <email>henner@google.com</email>
      </address>
    </author>
    <author initials="L." surname="Sassman" fullname="Lizzi Sassman">
      <organization>Google LLC</organization>
      <address>
        <postal>
          <street>Brandschenkestrasse 110</street>
          <city>Zürich</city>
          <code>8002</code>
          <country>Switzerland</country>
        </postal>
        <email>lizzi@google.com</email>
      </address>
    </author>
    <date year="2022" month="September"/>
    <keyword>robot</keyword>
    <keyword>crawler</keyword>
    <keyword>robots.txt</keyword>
    <abstract>
      <t>This document specifies and extends the "Robots Exclusion Protocol"
      method originally defined by Martijn Koster in 1994 for service owners
      to control how content served by their services may be accessed, if at
      all, by automatic clients known as crawlers. Specifically, it adds
      definition language for the protocol, instructions for handling
      errors, and instructions for caching.</t>
    </abstract>
  </front>
  <middle>
    <section anchor="introduction" numbered="true" toc="default">
      <name>Introduction</name>
      <t>This document applies to services that provide resources that
      clients can access through URIs as defined in
      <xref target="RFC3986" format="default"/>. For example, in the context
      of HTTP, a browser is a client that displays the content of a web
      page.</t>
      <t>Crawlers are automated clients. Search engines, for instance, have
      crawlers to recursively traverse links for indexing as defined in
      <xref target="RFC8288" format="default"/>.</t>
      <t>It may be inconvenient for service owners if crawlers visit the
      entirety of their URI space. This document specifies the rules
      originally defined by the "Robots Exclusion Protocol"
      <xref target="ROBOTSTXT" format="default"/> that crawlers are requested
      to honor when accessing URIs.</t>
      <t>These rules are not a form of access authorization.</t>
      <section anchor="requirements-language" numbered="true" toc="default">
        <name>Requirements Language</name>
        <t>The key words "<bcp14>MUST</bcp14>", "<bcp14>MUST NOT</bcp14>",
        "<bcp14>REQUIRED</bcp14>", "<bcp14>SHALL</bcp14>",
        "<bcp14>SHALL NOT</bcp14>", "<bcp14>SHOULD</bcp14>",
        "<bcp14>SHOULD NOT</bcp14>", "<bcp14>RECOMMENDED</bcp14>",
        "<bcp14>NOT RECOMMENDED</bcp14>", "<bcp14>MAY</bcp14>", and
        "<bcp14>OPTIONAL</bcp14>" in this document are to be interpreted as
        described in BCP 14 <xref target="RFC2119"/> <xref target="RFC8174"/>
        when, and only when, they appear in all capitals, as shown here.</t>
      </section>
    </section>
<section anchor="specification" title="Specification"> | <section anchor="specification" numbered="true" toc="default"> | |||
<section anchor="protocol-definition" title="Protocol Definition"> | <name>Specification</name> | |||
<section anchor="protocol-definition" numbered="true" toc="default"> | ||||
<name>Protocol Definition</name> | ||||
<t> The protocol language consists of rule(s) and group(s) that the serv ice | <t> The protocol language consists of rule(s) and group(s) that the serv ice | |||
makes available in a file named 'robots.txt' as described in | makes available in a file named "robots.txt" as described in | |||
<xref target="access-method" />: </t> | <xref target="access-method" format="default"/>: </t> | |||
<t> | <dl spacing="normal"> | |||
<list style="symbols"> | <dt> Rule:</dt><dd> A line with a key-value pair that defines how a | |||
<t> Rule: A line with a key-value pair that defines how a | ||||
crawler may access URIs. See | crawler may access URIs. See | |||
<xref target="the-allow-and-disallow-lines" />. </t> | <xref target="the-allow-and-disallow-lines" format="default"/>. | |||
<t> Group: One or more user-agent lines that is followed by | </dd> | |||
<dt> Group:</dt><dd> One or more user-agent lines that are followed by | ||||
one or more rules. The group is terminated by a user-agent line | one or more rules. The group is terminated by a user-agent line | |||
or end of file. See <xref target="the-user-agent-line" />. | or end of file. See <xref target="the-user-agent-line" format="d efault"/>. | |||
The last group may have no rules, which means it implicitly | The last group may have no rules, which means it implicitly | |||
allows everything. </t> | allows everything. </dd> | |||
</list> </t> | </dl> | |||
</section> | </section> | |||
<section anchor="formal-syntax" title="Formal Syntax"> | <section anchor="formal-syntax" numbered="true" toc="default"> | |||
<name>Formal Syntax</name> | ||||
<t> Below is an Augmented Backus-Naur Form (ABNF) description, as descri bed | <t> Below is an Augmented Backus-Naur Form (ABNF) description, as descri bed | |||
in <xref target="RFC5234"/>. </t> | in <xref target="RFC5234" format="default"/>. </t> | |||
<sourcecode name="" type="abnf"><![CDATA[ | ||||
<figure><artwork> | robotstxt = *(group / emptyline) | |||
<![CDATA[ | group = startgroupline ; We start with a user-agent | |||
robotstxt = *(group / emptyline) | ; line | |||
group = startgroupline ; We start with a user-agent | *(startgroupline / emptyline) ; ... and possibly more | |||
*(startgroupline / emptyline) ; ... and possibly more | ; user-agent lines | |||
; user-agents | *(rule / emptyline) ; followed by rules relevant | |||
*(rule / emptyline) ; followed by rules relevant | ; for the preceding | |||
; for UAs | ; user-agent lines | |||
startgroupline = *WS "user-agent" *WS ":" *WS product-token EOL | startgroupline = *WS "user-agent" *WS ":" *WS product-token EOL | |||
rule = *WS ("allow" / "disallow") *WS ":" | rule = *WS ("allow" / "disallow") *WS ":" | |||
*WS (path-pattern / empty-pattern) EOL | *WS (path-pattern / empty-pattern) EOL | |||
; parser implementors: define additional lines you need (for | ; parser implementors: define additional lines you need (for | |||
; example, sitemaps). | ; example, Sitemaps). | |||
product-token = identifier / "*" | product-token = identifier / "*" | |||
path-pattern = "/" *UTF8-char-noctl ; valid URI path pattern | path-pattern = "/" *UTF8-char-noctl ; valid URI path pattern | |||
empty-pattern = *WS | empty-pattern = *WS | |||
identifier = 1*(%x2D / %x41-5A / %x5F / %x61-7A) | identifier = 1*(%x2D / %x41-5A / %x5F / %x61-7A) | |||
comment = "#" *(UTF8-char-noctl / WS / "#") | comment = "#" *(UTF8-char-noctl / WS / "#") | |||
emptyline = EOL | emptyline = EOL | |||
EOL = *WS [comment] NL ; end-of-line may have | EOL = *WS [comment] NL ; end-of-line may have | |||
; optional trailing comment | ; optional trailing comment | |||
NL = %x0D / %x0A / %x0D.0A | NL = %x0D / %x0A / %x0D.0A | |||
WS = %x20 / %x09 | WS = %x20 / %x09 | |||
; UTF8 derived from RFC3629, but excluding control characters | ; UTF8 derived from RFC 3629, but excluding control characters | |||
UTF8-char-noctl = UTF8-1-noctl / UTF8-2 / UTF8-3 / UTF8-4 | UTF8-char-noctl = UTF8-1-noctl / UTF8-2 / UTF8-3 / UTF8-4 | |||
UTF8-1-noctl = %x21 / %x22 / %x24-7F ; excluding control, space, '#' | UTF8-1-noctl = %x21 / %x22 / %x24-7F ; excluding control, space, "#" | |||
UTF8-2 = %xC2-DF UTF8-tail | UTF8-2 = %xC2-DF UTF8-tail | |||
UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2UTF8-tail / | UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2UTF8-tail / | |||
%xED %x80-9F UTF8-tail / %xEE-EF 2UTF8-tail | %xED %x80-9F UTF8-tail / %xEE-EF 2UTF8-tail | |||
UTF8-4 = %xF0 %x90-BF 2UTF8-tail / %xF1-F3 3UTF8-tail / | UTF8-4 = %xF0 %x90-BF 2UTF8-tail / %xF1-F3 3UTF8-tail / | |||
%xF4 %x80-8F 2UTF8-tail | %xF4 %x80-8F 2UTF8-tail | |||
UTF8-tail = %x80-BF | UTF8-tail = %x80-BF | |||
]]> | ]]></sourcecode> | |||
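        <t>The grammar above maps directly onto a line-oriented parser. The
        following non-normative Python sketch (its names are illustrative
        and not part of the protocol) collects user-agent lines and their
        rules into groups while skipping comments, empty lines, and records
        it does not recognize:</t>
        <sourcecode name="" type="python"><![CDATA[
import re
from dataclasses import dataclass, field

@dataclass
class Group:
    user_agents: list = field(default_factory=list)  # product tokens
    rules: list = field(default_factory=list)        # (verb, path) pairs

# key *WS ":" *WS value, with "#" starting a trailing comment
LINE = re.compile(r"^[ \t]*([A-Za-z_-]+)[ \t]*:[ \t]*([^#]*)")

def parse_robotstxt(body: str) -> list:
    groups, current, saw_rule = [], None, False
    for line in body.splitlines():
        match = LINE.match(line)
        if not match:
            continue  # empty line, comment-only line, or unparseable line
        key, value = match.group(1).lower(), match.group(2).strip()
        if key == "user-agent":
            # A user-agent line following rules starts a new group;
            # consecutive user-agent lines extend the current group.
            if current is None or saw_rule:
                current = Group()
                groups.append(current)
                saw_rule = False
            current.user_agents.append(value)
        elif key in ("allow", "disallow") and current is not None:
            current.rules.append((key, value))
            saw_rule = True
        # Other records (e.g., sitemap) are tolerated here and
        # do not terminate the group.
    return groups
]]></sourcecode>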
        <section anchor="the-user-agent-line" numbered="true" toc="default">
          <name>The User-Agent Line</name>
          <t>Crawlers set their own name, which is called a product token,
          to find relevant groups. The product token <bcp14>MUST</bcp14>
          contain only uppercase and lowercase letters ("a-z" and "A-Z"),
          underscores ("_"), and hyphens ("-"). The product token
          <bcp14>SHOULD</bcp14> be a substring of the identification string
          that the crawler sends to the service. For example, in the case of
          HTTP <xref target="RFC9110" format="default"/>, the product token
          <bcp14>SHOULD</bcp14> be a substring in the User-Agent header. The
          identification string <bcp14>SHOULD</bcp14> describe the purpose
          of the crawler. Here's an example of a User-Agent HTTP request
          header with a link pointing to a page describing the purpose of
          the ExampleBot crawler, which appears as a substring in the
          User-Agent HTTP header and as a product token in the robots.txt
          user-agent line:</t>
          <figure anchor="fig-1">
            <name>Example of a User-Agent HTTP header and robots.txt
            user-agent line for the ExampleBot product token</name>
            <artwork name="" type="" align="center" alt=""><![CDATA[
+==========================================+========================+
| User-Agent HTTP header                   | robots.txt user-agent  |
|                                          | line                   |
+==========================================+========================+
| User-Agent: Mozilla/5.0 (compatible;     | user-agent: ExampleBot |
| ExampleBot/0.1;                          |                        |
| https://www.example.com/bot.html)        |                        |
+------------------------------------------+------------------------+
]]></artwork>
          </figure>
          <t>Note that the product token (ExampleBot) is a substring of the
          User-Agent HTTP header.</t>
          <t>Crawlers <bcp14>MUST</bcp14> use case-insensitive matching to
          find the group that matches the product token and then obey the
          rules of the group. If there is more than one group matching the
          user-agent, the matching groups' rules <bcp14>MUST</bcp14> be
          combined into one group and parsed according to
          <xref target="the-allow-and-disallow-lines" format="default"/>.</t>
          <figure anchor="fig-2">
            <name>Example of how to merge two robots.txt groups that match
            the same product token</name>
            <artwork name="" type="" align="center" alt=""><![CDATA[
+========================================+========================+
| Two groups that match the same product | Merged group           |
| token exactly                          |                        |
+========================================+========================+
| user-agent: ExampleBot                 | user-agent: ExampleBot |
| disallow: /foo                         | disallow: /foo         |
| disallow: /bar                         | disallow: /bar         |
|                                        | disallow: /baz         |
| user-agent: ExampleBot                 |                        |
| disallow: /baz                         |                        |
+----------------------------------------+------------------------+
]]></artwork>
          </figure>
          <t>If no matching group exists, crawlers <bcp14>MUST</bcp14> obey
          the group with a user-agent line with the "*" value, if
          present.</t>
          <figure anchor="fig-3">
            <name>Example of no matching groups other than the "*" for the
            ExampleBot product token</name>
            <artwork name="" type="" align="center" alt=""><![CDATA[
+==================================+======================+
| Two groups that don't explicitly | Applicable group for |
| match ExampleBot                 | ExampleBot           |
+==================================+======================+
| user-agent: *                    | user-agent: *        |
| disallow: /foo                   | disallow: /foo       |
| disallow: /bar                   | disallow: /bar       |
|                                  |                      |
| user-agent: BazBot               |                      |
| disallow: /baz                   |                      |
+----------------------------------+----------------------+
]]></artwork>
          </figure>
          <t>If no group matches the product token and there is no group
          with a user-agent line with the "*" value, or no groups are
          present at all, no rules apply.</t>
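          <t>A non-normative sketch of this selection logic follows. The
          group representation and names are illustrative; groups are
          (user-agents, rules) pairs such as those produced by the parser
          sketched in <xref target="formal-syntax" format="default"/>:</t>
          <sourcecode name="" type="python"><![CDATA[
def effective_rules(groups, product_token):
    """Merge the rules of every group whose user-agent list contains
    product_token (case-insensitively); fall back to the "*" group;
    if neither exists, no rules apply."""
    token = product_token.lower()
    matched = [rules for agents, rules in groups
               if token in (agent.lower() for agent in agents)]
    if not matched:
        matched = [rules for agents, rules in groups if "*" in agents]
    # Rules of all matching groups are combined into one group.
    return [rule for rules in matched for rule in rules]

groups = [
    (["ExampleBot"], [("disallow", "/foo"), ("disallow", "/bar")]),
    (["ExampleBot"], [("disallow", "/baz")]),
]
assert effective_rules(groups, "examplebot") == [
    ("disallow", "/foo"), ("disallow", "/bar"), ("disallow", "/baz")]
assert effective_rules(groups, "OtherBot") == []  # no "*" group here
]]></sourcecode>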
        </section>
<section anchor="the-allow-and-disallow-lines" title="The Allow and Disa | <section anchor="the-allow-and-disallow-lines" numbered="true" toc="defa | |||
llow Lines"> | ult"> | |||
<name>The "Allow" and "Disallow" Lines</name> | ||||
<t> These lines indicate whether accessing a URI that matches the | <t> These lines indicate whether accessing a URI that matches the | |||
corresponding path is allowed or disallowed. </t> | corresponding path is allowed or disallowed. </t> | |||
<t> To evaluate if access to a URI is allowed, a crawler <bcp14>MUST</ bcp14> | <t> To evaluate if access to a URI is allowed, a crawler <bcp14>MUST</ bcp14> | |||
match the paths in allow and disallow rules against the URI. | match the paths in "allow" and "disallow" rules against the URI. | |||
The matching <bcp14>SHOULD</bcp14> be case sensitive. The matching | The matching <bcp14>SHOULD</bcp14> be case sensitive. The matching | |||
<bcp14>MUST</bcp14> start with the first octet of the path. The mo st | <bcp14>MUST</bcp14> start with the first octet of the path. The mo st | |||
specific match found <bcp14>MUST</bcp14> be used. The most specifi c | specific match found <bcp14>MUST</bcp14> be used. The most specifi c | |||
match is the match that has the most octets. Duplicate rules in a | match is the match that has the most octets. Duplicate rules in a | |||
group <bcp14>MAY</bcp14> be deduplicated. If an allow and disallow | group <bcp14>MAY</bcp14> be deduplicated. If an "allow" rule and a | |||
rule are equivalent, then the allow rule <bcp14>SHOULD</bcp14> be | "disallow" | |||
used. If no | rule are equivalent, then the "allow" rule <bcp14>SHOULD</bcp14> b | |||
match is found amongst the rules in a group for a matching user-ag | e used. If no | |||
ent, | match is found amongst the rules in a group for a matching user-ag | |||
ent | ||||
or there are no rules in the group, the URI is allowed. The | or there are no rules in the group, the URI is allowed. The | |||
/robots.txt URI is implicitly allowed. </t> | /robots.txt URI is implicitly allowed. </t> | |||
<t> Octets in the URI and robots.txt paths outside the range of the | <t> Octets in the URI and robots.txt paths outside the range of the | |||
US-ASCII coded character set, and those in the reserved range defi | ASCII coded character set, and those in the reserved range defined | |||
ned | by <xref target="RFC3986" format="default"/>, <bcp14>MUST</bcp14> | |||
by <xref target="RFC3986"/>, <bcp14>MUST</bcp14> be percent-encode | be percent-encoded as | |||
d as | defined by <xref target="RFC3986" format="default"/> prior to comp | |||
defined by <xref target="RFC3986"></xref> prior to comparison. </t | arison. </t> | |||
> | <t> If a percent-encoded ASCII octet is encountered in the URI, it | |||
<t> If a percent-encoded US-ASCII octet is encountered in the URI, it | ||||
<bcp14>MUST</bcp14> be unencoded prior to comparison, unless it is a | <bcp14>MUST</bcp14> be unencoded prior to comparison, unless it is a | |||
reserved character in the URI as defined by <xref target="RFC3986" /> | reserved character in the URI as defined by <xref target="RFC3986" format="default"/> | |||
or the character is outside the unreserved character range. The ma tch | or the character is outside the unreserved character range. The ma tch | |||
evaluates positively if and only if the end of the path from the r ule | evaluates positively if and only if the end of the path from the r ule | |||
is reached before a difference in octets is encountered. </t> | is reached before a difference in octets is encountered. </t> | |||
<t> For example: </t> | <t> For example: </t> | |||
<texttable title="Examples of matching percent-encoded URI components" | <figure anchor="fig-4"> | |||
> | <name>Examples of matching percent-encoded URI components</name> | |||
<ttcol align='left'>Path</ttcol> | <artwork name="" type="" align="center" alt=""><![CDATA[ | |||
<ttcol align='left'>Encoded Path</ttcol> | +==================+=======================+=======================+ | |||
<ttcol align='left'>Path to Match</ttcol> | | Path | Encoded Path | Path to Match | | |||
<c>/foo/bar?baz=quz</c> | +==================+=======================+=======================+ | |||
<c>/foo/bar?baz=quz</c> | | /foo/bar?baz=quz | /foo/bar?baz=quz | /foo/bar?baz=quz | | |||
<c>/foo/bar?baz=quz</c> | +------------------+-----------------------+-----------------------+ | |||
<c>/foo/bar?baz=http<br />://foo.bar</c> | | /foo/bar?baz= | /foo/bar?baz= | /foo/bar?baz= | | |||
<c>/foo/bar?baz=http%3A<br />%2F%2Ffoo.bar</c> | | https://foo.bar | https%3A%2F%2Ffoo.bar | https%3A%2F%2Ffoo.bar | | |||
<c>/foo/bar?baz=http%3A<br />%2F%2Ffoo.bar</c> | +------------------+-----------------------+-----------------------+ | |||
<c>/foo/bar/U+E38384</c> | | /foo/bar/ | /foo/bar/%E3%83%84 | /foo/bar/%E3%83%84 | | |||
<c>/foo/bar/%E3%83%84</c> | | U+E38384 | | | | |||
<c>/foo/bar/%E3%83%84</c> | +------------------+-----------------------+-----------------------+ | |||
<c>/foo/bar/%E3%83%84</c> | | /foo/ | /foo/bar/%E3%83%84 | /foo/bar/%E3%83%84 | | |||
<c>/foo/bar/%E3%83%84</c> | | bar/%E3%83%84 | | | | |||
<c>/foo/bar/%E3%83%84</c> | +------------------+-----------------------+-----------------------+ | |||
<c>/foo/bar/%62%61%7A</c> | | /foo/ | /foo/bar/%62%61%7A | /foo/bar/baz | | |||
<c>/foo/bar/%62%61%7A</c> | | bar/%62%61%7A | | | | |||
<c>/foo/bar/baz</c> | +------------------+-----------------------+-----------------------+ | |||
</texttable> | ]]></artwork> | |||
</figure> | ||||
<t> The crawler <bcp14>SHOULD</bcp14> ignore "disallow" and | <t> The crawler <bcp14>SHOULD</bcp14> ignore "disallow" and | |||
"allow" rules that are not in any group (for example, an | "allow" rules that are not in any group (for example, any | |||
y | ||||
rule that precedes the first user-agent line). </t> | rule that precedes the first user-agent line). </t> | |||
<t> Implementors <bcp14>MAY</bcp14> bridge encoding mismatches if they | ||||
<t> Implementers <bcp14>MAY</bcp14> bridge encoding mismatches if they | detect that the robots.txt file is not UTF-8 encoded. </t> | |||
detect that the robots.txt file is not UTF8 encoded. </t> | ||||
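          <t>The normalization and most-specific-match rules above can be
          sketched as follows. This is a non-normative simplification that
          handles only wildcard-free rules ("*" and "$" are covered in
          <xref target="special-characters" format="default"/>); both the
          rule path and the URI path are run through the same normalization
          so the comparison is octet-wise consistent:</t>
          <sourcecode name="" type="python"><![CDATA[
import re
from urllib.parse import quote

UNRESERVED = set("ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                 "abcdefghijklmnopqrstuvwxyz0123456789-._~")

def _decode_unreserved(match):
    char = chr(int(match.group(1), 16))
    # Decode only unreserved ASCII; keep reserved and non-ASCII
    # octets percent-encoded (uppercased for stable comparison).
    return char if char in UNRESERVED else match.group(0).upper()

def normalize(path: str) -> str:
    # Percent-encode octets outside ASCII; leave reserved characters
    # and existing "%" escapes alone.
    encoded = quote(path, safe="/?#[]@!$&'()*+,;=:%")
    return re.sub(r"%([0-9A-Fa-f]{2})", _decode_unreserved, encoded)

def is_allowed(rules, uri_path: str) -> bool:
    """Longest (most octets) matching rule wins; "allow" wins ties;
    no match at all means the URI is allowed."""
    path = normalize(uri_path)
    verdict = (-1, True)  # (matched octets, allowed)
    for verb, pattern in rules:
        rule = normalize(pattern)
        if rule and path.startswith(rule):
            verdict = max(verdict, (len(rule), verb == "allow"))
    return verdict[1]

rules = [("allow", "/example/page/"),
         ("disallow", "/example/page/disallowed.gif")]
assert not is_allowed(rules, "/example/page/disallowed.gif")
assert is_allowed(rules, "/example/page/index.html")
assert not is_allowed([("disallow", "/foo/bar/%62%61%7A")], "/foo/bar/baz")
assert is_allowed([("disallow", "/foo/bar/%62%61%7A")], "/foo/bar/qux")
]]></sourcecode>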
        </section>
<section anchor="special-characters" title="Special Characters"> | <section anchor="special-characters" numbered="true" toc="default"> | |||
<t> Crawlers <bcp14>MUST</bcp14> allow the following special character | <name>Special Characters</name> | |||
s: </t> | <t> Crawlers <bcp14>MUST</bcp14> support the following special charact | |||
ers: </t> | ||||
<texttable title="List of special characters in robots.txt files"> | <figure anchor="fig-5"> | |||
<ttcol align='left'>Character</ttcol> | <name>List of special characters in robots.txt files</name> | |||
<ttcol align='left'>Description</ttcol> | <artwork name="" type="" align="center" alt=""><![CDATA[ | |||
<ttcol align='left'>Example</ttcol> | +===========+===================+==============================+ | |||
<c>"#"</c> | | Character | Description | Example | | |||
<c>Designates an end of line comment.</c> | +===========+===================+==============================+ | |||
<c>"allow: / # comment in line"<br /><br />"# comment | | # | Designates a line | allow: / # comment in line | | |||
on its own line"</c> | | | comment. | | | |||
<c>"$"</c> | | | | # comment on its own line | | |||
<c>Designates the end of the match pattern.</c> | +-----------+-------------------+------------------------------+ | |||
<c>"allow: /this/path/exactly$"</c> | | $ | Designates the | allow: /this/path/exactly$ | | |||
<c>"*"</c> | | | end of the match | | | |||
<c>Designates 0 or more instances of any character.</c> | | | pattern. | | | |||
<c>"allow: /this/*/exactly"</c> | +-----------+-------------------+------------------------------+ | |||
</texttable> | | * | Designates 0 or | allow: /this/*/exactly | | |||
| | more instances of | | | ||||
| | any character. | | | ||||
+-----------+-------------------+------------------------------+ | ||||
]]></artwork> | ||||
</figure> | ||||
<t> If crawlers match special characters verbatim in the URI, crawlers | <t> If crawlers match special characters verbatim in the URI, crawlers | |||
<bcp14>SHOULD</bcp14> use "%" encoding. For example: </t | <bcp14>SHOULD</bcp14> use "%" encoding. For example: </t> | |||
> | <figure anchor="fig-6"> | |||
<name>Example of percent-encoding</name> | ||||
<texttable title="Example of percent-encoding"> | <artwork name="" type="" align="center" alt=""><![CDATA[ | |||
<ttcol align='left'>Percent-encoded Pattern</ttcol> | +============================+====================================+ | |||
<ttcol align='left'>URI</ttcol> | | Percent-encoded Pattern | URI | | |||
<c>/path/file-with-a-%2A.html</c> | +============================+====================================+ | |||
<c>https://www.example.com/path/file-with-a-*.html</c> | | /path/file-with-a-%2A.html | https://www.example.com/path/ | | |||
<c>/path/foo-%24</c> | | | file-with-a-*.html | | |||
<c>https://www.example.com/path/foo-$</c> | +----------------------------+------------------------------------+ | |||
</texttable> | | /path/foo-%24 | https://www.example.com/path/foo-$ | | |||
+----------------------------+------------------------------------+ | ||||
]]></artwork> | ||||
</figure> | ||||
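          <t>One way to implement "*" and "$" is to translate each path
          pattern into an anchored regular expression. The following
          non-normative Python sketch assumes the pattern and the URI path
          have already been percent-normalized consistently, as described in
          <xref target="the-allow-and-disallow-lines" format="default"/>:</t>
          <sourcecode name="" type="python"><![CDATA[
import re

def pattern_to_regex(path_pattern: str):
    """Compile a robots.txt path pattern: "*" matches any run of
    characters, a trailing "$" anchors the pattern at the end of the
    URI, and everything else (including %-escapes such as %2A) is
    matched literally from the first octet of the path."""
    anchored = path_pattern.endswith("$")
    if anchored:
        path_pattern = path_pattern[:-1]
    literal_parts = [re.escape(part) for part in path_pattern.split("*")]
    regex = "(?s:" + ".*".join(literal_parts) + ")"
    return re.compile(regex + ("$" if anchored else ""))

assert pattern_to_regex("/this/*/exactly").match("/this/and/that/exactly")
assert pattern_to_regex("/this/path/exactly$").match("/this/path/exactly")
assert not pattern_to_regex("/this/path/exactly$").match(
    "/this/path/exactly.html")
]]></sourcecode>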
        </section>
<section anchor="other-records" title="Other Records"> | <section anchor="other-records" numbered="true" toc="default"> | |||
<name>Other Records</name> | ||||
<t> Crawlers <bcp14>MAY</bcp14> interpret other records that are not | <t> Crawlers <bcp14>MAY</bcp14> interpret other records that are not | |||
part of the robots.txt protocol. For example, 'sitemap' | part of the robots.txt protocol -- for example, "Sitemaps" | |||
<xref target="SITEMAPS"/>. Crawlers MAY be lenient when | <xref target="SITEMAPS" format="default"/>. Crawlers <bcp14>MAY</b | |||
cp14> be lenient when | ||||
interpreting other records. For example, crawlers may accept | interpreting other records. For example, crawlers may accept | |||
common typos of the record. </t> | common misspellings of the record. </t> | |||
<t> Parsing of other records | <t> Parsing of other records | |||
<bcp14>MUST NOT</bcp14> interfere with the parsing of explicitly | <bcp14>MUST NOT</bcp14> interfere with the parsing of explicitly | |||
defined records in <xref target="specification" />. </t> | defined records in <xref target="specification" format="default"/> | |||
. | ||||
For example, a "Sitemaps" record <bcp14>MUST NOT</bcp14> terminate | ||||
a | ||||
group. </t> | ||||
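          <t>Lenient interpretation can be as simple as normalizing a record
          key before dispatching on it. The aliases in this non-normative
          sketch are purely illustrative assumptions, not part of the
          protocol:</t>
          <sourcecode name="" type="python"><![CDATA[
# Hypothetical aliases a crawler might choose to accept; unknown
# keys pass through unchanged and never terminate a group.
LENIENT_ALIASES = {
    "site-map": "sitemap",   # assumed misspelling, for illustration
    "sitemaps": "sitemap",   # assumed misspelling, for illustration
}

def canonical_key(key: str) -> str:
    key = key.strip().lower()
    return LENIENT_ALIASES.get(key, key)

assert canonical_key("Site-Map") == "sitemap"
assert canonical_key("disallow") == "disallow"
]]></sourcecode>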
        </section>
      </section>
<section anchor="access-method" title="Access Method"> | <section anchor="access-method" numbered="true" toc="default"> | |||
<t> The rules <bcp14>MUST</bcp14> be accessible in a file named | <name>Access Method</name> | |||
"/robots.txt" (all lower case) in the top level path of | <t> The rules <bcp14>MUST</bcp14> be accessible in a file named | |||
"/robots.txt" (all lowercase) in the top-level path of | ||||
the service. The file <bcp14>MUST</bcp14> be UTF-8 encoded (as | the service. The file <bcp14>MUST</bcp14> be UTF-8 encoded (as | |||
defined in <xref target="RFC3629"/>) and Internet Media Type | defined in <xref target="RFC3629" format="default"/>) and Internet Med | |||
"text/plain" | ia Type | |||
(as defined in <xref target="RFC2046"/>). </t> | "text/plain" | |||
<t> As per <xref target="RFC3986"/>, the URI of the robots.txt is: </t> | (as defined in <xref target="RFC2046" format="default"/>). </t> | |||
<t> "scheme:[//authority]/robots.txt" </t> | <t> As per <xref target="RFC3986" format="default"/>, the URI of the rob | |||
<t> For example, in the context of HTTP or FTP, the URI is: </t> | ots.txt file is: </t> | |||
<t> "scheme:[//authority]/robots.txt" </t> | ||||
<figure> | <t> For example, in the context of HTTP or FTP, the URI is: </t> | |||
<artwork><![CDATA[ | <artwork name="" type="" align="left" alt=""><![CDATA[ | |||
https://www.example.com/robots.txt | https://www.example.com/robots.txt | |||
ftp://ftp.example.com/robots.txt | ftp://ftp.example.com/robots.txt | |||
]]></artwork> | ]]></artwork> | |||
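        <t>Deriving that URI from an arbitrary resource URI is mechanical,
        as in this non-normative sketch:</t>
        <sourcecode name="" type="python"><![CDATA[
from urllib.parse import urlsplit, urlunsplit

def robotstxt_uri(uri: str) -> str:
    """Build scheme:[//authority]/robots.txt for the authority that
    serves `uri`; path, query, and fragment are discarded."""
    parts = urlsplit(uri)
    return urlunsplit((parts.scheme, parts.netloc, "/robots.txt", "", ""))

assert (robotstxt_uri("https://www.example.com/a/b?c=d")
        == "https://www.example.com/robots.txt")
assert (robotstxt_uri("ftp://ftp.example.com/pub/file")
        == "ftp://ftp.example.com/robots.txt")
]]></sourcecode>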
        <section anchor="access-results" numbered="true" toc="default">
          <name>Access Results</name>
          <section anchor="successful-access" numbered="true" toc="default">
            <name>Successful Access</name>
            <t>If the crawler successfully downloads the robots.txt file,
            the crawler <bcp14>MUST</bcp14> follow the parseable rules.</t>
          </section>
          <section anchor="redirects" numbered="true" toc="default">
            <name>Redirects</name>
            <t>It's possible that a server responds to a robots.txt fetch
            request with a redirect, such as HTTP 301 or HTTP 302 in the
            case of HTTP. The crawlers <bcp14>SHOULD</bcp14> follow at least
            five consecutive redirects, even across authorities (for
            example, hosts in the case of HTTP).</t>
            <t>If a robots.txt file is reached within five consecutive
            redirects, the robots.txt file <bcp14>MUST</bcp14> be fetched,
            parsed, and its rules followed in the context of the initial
            authority.</t>
            <t>If there are more than five consecutive redirects, crawlers
            <bcp14>MAY</bcp14> assume that the robots.txt file is
            unavailable.</t>
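            <t>A non-normative fetch loop implementing this policy,
            assuming the third-party Python "requests" library:</t>
            <sourcecode name="" type="python"><![CDATA[
from urllib.parse import urljoin
import requests  # third-party HTTP client, assumed for illustration

MAX_REDIRECTS = 5  # SHOULD follow at least five consecutive redirects

def fetch_robotstxt(uri: str):
    """Return the final response, or None if more than five
    consecutive redirects were encountered (the file MAY then be
    treated as unavailable). Redirects are followed even across
    authorities."""
    for _ in range(MAX_REDIRECTS + 1):  # initial fetch + 5 redirects
        response = requests.get(uri, allow_redirects=False, timeout=10)
        if response.is_redirect:
            # Location may be relative; resolve against current URI.
            uri = urljoin(uri, response.headers["Location"])
            continue
        return response
    return None
]]></sourcecode>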
          </section>
<section anchor="unavailable-status" title="Unavailable Status"> | <section anchor="unavailable-status" numbered="true" toc="default"> | |||
<t> Unavailable means the crawler tries to fetch the robots.txt, | <name>"Unavailable" Status</name> | |||
and the server responds with unavailable status codes. For | <t> "Unavailable" means the crawler tries to fetch the robots.txt fi | |||
example, in the context of HTTP, unavailable status codes are | le | |||
and the server responds with status codes indicating that the reso | ||||
urce in question is unavailable. For | ||||
example, in the context of HTTP, such status codes are | ||||
in the 400-499 range. </t> | in the 400-499 range. </t> | |||
<t> If a server status code indicates that the robots.txt file is | ||||
<t> If a server status code indicates that the robots.txt file is | unavailable to the crawler, then the crawler <bcp14>MAY</bcp14> ac | |||
unavailable to the crawler, then the crawler MAY access any | cess any | |||
resources on the server. </t> | resources on the server. </t> | |||
</section> | </section> | |||
<section anchor="unreachable-status" title="Unreachable Status"> | <section anchor="unreachable-status" numbered="true" toc="default"> | |||
<t> If the robots.txt is unreachable due to server or network | <name>"Unreachable" Status</name> | |||
errors, this means the robots.txt is undefined and the crawler | <t> If the robots.txt file is unreachable due to server or network | |||
errors, this means the robots.txt file is undefined and the crawle | ||||
r | ||||
<bcp14>MUST</bcp14> assume complete disallow. For example, in | <bcp14>MUST</bcp14> assume complete disallow. For example, in | |||
the context of HTTP, an unreachable robots.txt has a response | the context of HTTP, server errors are identified by status codes | |||
code in the 500-599 range. </t> | in the 500-599 range. </t> | |||
<t> If the robots.txt is undefined for a reasonably long period of | <t> If the robots.txt file is undefined for a reasonably long period | |||
time (for example, 30 days), crawlers <bcp14>MAY</bcp14> assume | of | |||
the robots.txt is unavailable as defined in | time (for example, 30 days), crawlers <bcp14>MAY</bcp14> assume th | |||
<xref target="unavailable-status"/> or continue to use a cached | at | |||
the robots.txt file is unavailable as defined in | ||||
<xref target="unavailable-status" format="default"/> or continue t | ||||
o use a cached | ||||
copy. </t> | copy. </t> | |||
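            <t>Taken together with the previous section, the status-code
            handling reduces to a small decision function. This
            non-normative sketch uses illustrative labels and ignores the
            30-day relaxation described above:</t>
            <sourcecode name="" type="python"><![CDATA[
def crawl_posture(status_code: int) -> str:
    """Map an HTTP fetch result for /robots.txt to a crawling
    posture (labels are illustrative, not protocol elements)."""
    if 400 <= status_code <= 499:
        return "allow-all"      # unavailable: MAY access any resource
    if 500 <= status_code <= 599:
        return "disallow-all"   # unreachable: MUST assume complete
                                # disallow
    return "parse-and-obey"     # successful: follow parseable rules

assert crawl_posture(404) == "allow-all"
assert crawl_posture(503) == "disallow-all"
assert crawl_posture(200) == "parse-and-obey"
]]></sourcecode>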
          </section>
<section anchor="parsing-errors" title="Parsing Errors"> | <section anchor="parsing-errors" numbered="true" toc="default"> | |||
<t> Crawlers <bcp14>MUST</bcp14> try to parse each line of the | <name>Parsing Errors</name> | |||
<t> Crawlers <bcp14>MUST</bcp14> try to parse each line of the | ||||
robots.txt file. Crawlers <bcp14>MUST</bcp14> use the parseable | robots.txt file. Crawlers <bcp14>MUST</bcp14> use the parseable | |||
rules. </t> | rules. </t> | |||
</section> | ||||
</section> | </section> | |||
</section> | </section> | |||
      <section anchor="caching" numbered="true" toc="default">
        <name>Caching</name>
        <t>Crawlers <bcp14>MAY</bcp14> cache the fetched robots.txt file's
        contents. Crawlers <bcp14>MAY</bcp14> use standard cache control as
        defined in <xref target="RFC9111" format="default"/>. Crawlers
        <bcp14>SHOULD NOT</bcp14> use the cached version for more than 24
        hours, unless the robots.txt file is unreachable.</t>
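        <t>A minimal cache entry honoring the 24-hour guidance might look
        as follows (non-normative; a production crawler could instead honor
        Cache-Control semantics per
        <xref target="RFC9111" format="default"/>):</t>
        <sourcecode name="" type="python"><![CDATA[
import time

MAX_AGE_SECONDS = 24 * 60 * 60  # SHOULD NOT serve from cache beyond this

class CachedRobots:
    def __init__(self, body: str):
        self.body = body
        self.fetched_at = time.time()

    def usable(self, robots_unreachable: bool = False) -> bool:
        # A stale copy may keep being used while the file is
        # unreachable (see the "Unreachable" Status section).
        if robots_unreachable:
            return True
        return time.time() - self.fetched_at < MAX_AGE_SECONDS
]]></sourcecode>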
      </section>
<section anchor="limits" title="Limits"> | <section anchor="limits" numbered="true" toc="default"> | |||
<t> Crawlers SHOULD impose a parsing limit to protect their systems; | <name>Limits</name> | |||
see <xref target="security"/>. The parsing limit MUST be at least | <t> Crawlers <bcp14>SHOULD</bcp14> impose a parsing limit to protect the | |||
500 kibibytes <xref target="KiB"/>. </t> | ir systems; | |||
see <xref target="security" format="default"/>. The parsing limit <bcp | ||||
14>MUST</bcp14> be at least | ||||
500 kibibytes <xref target="KiB" format="default"/>. </t> | ||||
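        <t>One possible (non-normative) enforcement policy is to truncate
        the fetched body at the limit and drop any partial trailing line so
        the parser only ever sees complete lines:</t>
        <sourcecode name="" type="python"><![CDATA[
PARSING_LIMIT = 500 * 1024  # 500 kibibytes, the minimum required limit

def clip_robotstxt(raw: bytes) -> str:
    clipped = raw[:PARSING_LIMIT]
    if len(raw) > PARSING_LIMIT:
        # Drop the possibly half-transferred final line.
        clipped = clipped.rsplit(b"\n", 1)[0]
    return clipped.decode("utf-8", errors="replace")
]]></sourcecode>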
      </section>
    </section>
<section anchor="security" title="Security Considerations"> | <section anchor="security" numbered="true" toc="default"> | |||
<t> The Robots Exclusion Protocol is not a substitute for more valid | <name>Security Considerations</name> | |||
<t> The Robots Exclusion Protocol is not a substitute for valid | ||||
content security measures. Listing paths in the robots.txt file | content security measures. Listing paths in the robots.txt file | |||
exposes them publicly and thus makes the paths discoverable. To | exposes them publicly and thus makes the paths discoverable. To | |||
control access to the URI paths in a robots.txt file, users of | control access to the URI paths in a robots.txt file, users of | |||
the protocol should employ a valid security measure relevant to | the protocol should employ a valid security measure relevant to | |||
the application layer on which the robots.txt file is served. | the application layer on which the robots.txt file is served -- | |||
For example, in case of HTTP, HTTP Authentication defined in | for example, in the case of HTTP, HTTP Authentication as defined in | |||
<xref target="RFC9110"/>. </t> | <xref target="RFC9110" format="default"/>. </t> | |||
<t> To protect against attacks against their system, implementors | <t> To protect against attacks against their system, implementors | |||
of robots.txt parsing and matching logic should take the | of robots.txt parsing and matching logic should take the | |||
following considerations into account: </t> | following considerations into account: </t> | |||
<t> | <dl spacing="normal"> | |||
<list style="symbols"> | <dt> Memory management:</dt><dd> <xref target="limits" format="default"/ | |||
<t> Memory management: <xref target="limits" /> defines the lower | > defines the lower | |||
limit of bytes that must be processed, which inherently also | limit of bytes that must be processed, which inherently also | |||
protects the parser from out of memory scenarios. </t> | protects the parser from out-of-memory scenarios. </dd> | |||
<t> Invalid characters: <xref target="formal-syntax" /> defines | <dt> Invalid characters:</dt><dd> <xref target="formal-syntax" format="d | |||
efault"/> defines | ||||
a set of characters that parsers and matchers can expect in | a set of characters that parsers and matchers can expect in | |||
robots.txt files. Out of bound characters should be rejected | robots.txt files. Out-of-bound characters should be rejected | |||
as invalid, which limits the available attack vectors that | as invalid, which limits the available attack vectors that | |||
attempt to compromise the system. </t> | attempt to compromise the system. </dd> | |||
<t> Untrusted content: Implementors should treat the content of | <dt> Untrusted content:</dt><dd> Implementors should treat the content o | |||
f | ||||
a robots.txt file as untrusted content, as defined by the | a robots.txt file as untrusted content, as defined by the | |||
specification of the application layer used. For example, | specification of the application layer used. For example, | |||
in the context of HTTP, implementors should follow the | in the context of HTTP, implementors should follow the | |||
security considerations section of | Security Considerations section of | |||
<xref target="RFC9110"/>. </t> | <xref target="RFC9110" format="default"/>. </dd> | |||
</list> | </dl> | |||
</t> | ||||
</section> | </section> | |||
<section anchor="IANA" title="IANA Considerations"> | <section anchor="IANA" numbered="true" toc="default"> | |||
<t> This document has no actions for IANA. </t> | <name>IANA Considerations</name> | |||
<t> This document has no IANA actions. </t> | ||||
</section> | </section> | |||
<section anchor="examples" title="Examples"> | <section anchor="examples" numbered="true" toc="default"> | |||
<section anchor="simple-example" title="Simple Example"> | <name>Examples</name> | |||
<section anchor="simple-example" numbered="true" toc="default"> | ||||
<name>Simple Example</name> | ||||
<t> The following example shows: </t> | <t> The following example shows: </t> | |||
<t> | <dl spacing="normal"> | |||
<list style="symbols"> | <dt> *:</dt><dd> A group that's relevant to all user agents that | |||
<t> *: A group that's relevant to all user-agents that | ||||
don't have an explicitly defined matching group. It allows | don't have an explicitly defined matching group. It allows | |||
access to the URLs with the /publications/ path prefix, and | access to the URLs with the /publications/ path prefix, and it | |||
restricts access to the URLs with the /example/ path prefix | restricts access to the URLs with the /example/ path prefix | |||
and to all URLs with .gif suffix. The * character designates | and to all URLs with a .gif suffix. The "*" character designates | |||
any character, including the otherwise required forward | any character, including the otherwise-required forward | |||
slash; see <xref target="formal-syntax" />. </t> | slash; see <xref target="formal-syntax" format="default"/>. </dd | |||
<t> foobot: A regular case. A single user-agent followed | > | |||
<dt> foobot:</dt><dd> A regular case. A single user agent followed | ||||
by rules. The crawler only has access to two URL path | by rules. The crawler only has access to two URL path | |||
prefixes on the site, /example/page.html and | prefixes on the site -- /example/page.html and | |||
/example/allowed.gif. The rules of the group are missing | /example/allowed.gif. The rules of the group are missing | |||
the optional whitespace character, which is acceptable as | the optional space character, which is acceptable as | |||
defined in <xref target="formal-syntax" />. </t> | defined in <xref target="formal-syntax" format="default"/>. </dd | |||
<t> barbot and bazbot: A group that's relevant for more | > | |||
than one user-agent. The crawlers are not allowed to access | <dt> barbot and bazbot:</dt><dd> A group that's relevant for more | |||
the URLs with the /example/page.html path prefix, but | than one user agent. The crawlers are not allowed to access | |||
the URLs with the /example/page.html path prefix but | ||||
otherwise have unrestricted access to the rest of the URLs | otherwise have unrestricted access to the rest of the URLs | |||
on the site. </t> | on the site. </dd> | |||
<t> quxbot: An empty group at end of the file. The crawler has | <dt> quxbot:</dt><dd> An empty group at the end of the file. The crawl | |||
unrestricted access to the URLs on the site. </t> | er has | |||
</list> | unrestricted access to the URLs on the site. </dd> | |||
</t> | </dl> | |||
<figure> | <artwork name="" type="" align="left" alt=""><![CDATA[ | |||
<artwork><![CDATA[ | User-Agent: * | |||
User-agent: * | ||||
Disallow: *.gif$ | Disallow: *.gif$ | |||
Disallow: /example/ | Disallow: /example/ | |||
Allow: /publications/ | Allow: /publications/ | |||
User-Agent: foobot | User-Agent: foobot | |||
Disallow:/ | Disallow:/ | |||
Allow:/example/page.html | Allow:/example/page.html | |||
Allow:/example/allowed.gif | Allow:/example/allowed.gif | |||
User-Agent: barbot | User-Agent: barbot | |||
User-Agent: bazbot | User-Agent: bazbot | |||
Disallow: /example/page.html | Disallow: /example/page.html | |||
User-Agent: quxbot | User-Agent: quxbot | |||
EOF | EOF | |||
]]></artwork> | ]]></artwork> | |||
</figure> | ||||
</section> | </section> | |||
<section anchor="longest-match" title="Longest Match"> | <section anchor="longest-match" numbered="true" toc="default"> | |||
<name>Longest Match</name> | ||||
<t> The following example shows that in the case of two rules, the | <t> The following example shows that in the case of two rules, the | |||
longest one is used for matching. In the following case, | longest one is used for matching. In the following case, | |||
/example/page/disallowed.gif <bcp14>MUST</bcp14> be used for | /example/page/disallowed.gif <bcp14>MUST</bcp14> be used for | |||
the URI example.com/example/page/disallow.gif. </t> | the URI example.com/example/page/disallow.gif. </t> | |||
<figure> | <artwork name="" type="" align="left" alt=""><![CDATA[ | |||
<artwork><![CDATA[ | ||||
User-Agent: foobot | User-Agent: foobot | |||
Allow: /example/page/ | Allow: /example/page/ | |||
Disallow: /example/page/disallowed.gif | Disallow: /example/page/disallowed.gif | |||
]]></artwork> | ]]></artwork> | |||
</figure> | ||||
</section> | </section> | |||
</section> | </section> | |||
  </middle>
  <back>
    <references>
      <name>References</name>
      <references>
        <name>Normative References</name>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.2046.xml"/>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.2119.xml"/>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.3629.xml"/>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.3986.xml"/>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.5234.xml"/>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.8174.xml"/>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.8288.xml"/>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.9110.xml"/>
        <xi:include href="https://bib.ietf.org/public/rfc/bibxml/reference.RFC.9111.xml"/>
      </references>
      <references>
        <name>Informative References</name>
        <reference anchor="ROBOTSTXT" target="https://www.robotstxt.org/">
          <front>
            <title>The Web Robots Pages (including /robots.txt)</title>
            <author>
              <organization/>
            </author>
            <date>2007</date>
          </front>
        </reference>
        <reference anchor="SITEMAPS" target="https://www.sitemaps.org/index.html">
          <front>
            <title>What are Sitemaps? (Sitemap protocol)</title>
            <author>
              <organization/>
            </author>
            <date>April 2020</date>
          </front>
        </reference>
        <reference anchor="KiB" target="https://simple.wikipedia.org/wiki/Kibibyte">
          <front>
            <title>Kibibyte</title>
            <author>
              <organization/>
            </author>
            <date day="17" month="September" year="2020"/>
          </front>
          <refcontent>Simple English Wikipedia, the free encyclopedia</refcontent>
        </reference>
      </references>
    </references>
  </back>
</rfc>