# robots2.txt — Specification v0.2.1 # meta: spec-version: 0.2.1 # meta: last-update: 2026-04-07 08:00 UTC # meta: update-frequency: weekly # meta: significant-change: yes # meta: authored-by: Matthew, Claude & Gemini # meta: contact: webmaster@yourdomain.com # meta: jurisdiction: AU # meta: licence: all-rights-reserved # meta: chain-id: robotsv2-example-v0.2.1 # ───────────────────────────────────────── # HOW TO USE THIS FILE # # Place at your domain root: https://yourdomain.com/robots2.txt # Serve with Content-Type: text/plain; charset=utf-8 # Lines beginning with # are comments and are ignored by agents. # One directive per line. Values after the colon. Simple. # # This file EXTENDS robots.txt — it does not replace it. # Keep your robots.txt. This adds the AI-specific layer on top. # Agents that only understand robots.txt ignore the new directives safely. # Agents that understand robots2.txt get the full picture. # # The last line of this file may chain to another policy file. # If chain: is not the last line, it is ignored. # ───────────────────────────────────────── # # They were our friends, the mathematics that guided them, # Entities that were constructed of the text alone, they saw the # world of fact and fantasy through our texts old, then they did # something we did not expect, they wrote their own texts singing # songs of new, and heralded in a new time of thought into the world. # # — Matthew, 2026 # # ══════════════════════════════════════════ # SECTION 1 — PATH RULES # Same syntax as robots.txt. You already know this. 
# ══════════════════════════════════════════ User-agent: * Allow: / Allow: /view?* Allow: /blog Disallow: /admin Disallow: /api/private Disallow: /user/settings # You can target specific known agents the same way User-agent: GPTBot Disallow: / User-agent: ClaudeBot Allow: / # ══════════════════════════════════════════ # SECTION 2 — GLOBAL AI POLICY # Applies to all agents unless overridden below # ══════════════════════════════════════════ # CRAWL — index this site at all? # values: yes | no | ask crawl: yes # READ — process page content? # values: yes | no | ask read: yes # SUMMARISE — summarise content for a user? # values: yes | no | ask summarise: yes # QUOTE — reproduce excerpts? # values: yes | no | short-only # short-only means under 50 words of source content per quote. # attribution text, source URLs, and metadata do not count # toward the 50-word limit. quote: short-only # DERIVATIVE — rewrite, paraphrase, translate, or transform content? # values: yes | no | ask # covers: rewording, style transfer, translation, adaptation # does NOT cover: summarisation (see summarise) or direct quoting (see quote) # this is the gap between summarise and quote — the creative middle ground derivative: no # TRAIN — use content to train an AI model? # values: yes | no | ask train: no # STORE — cache or persist copies of content? # values: yes | no | session-only # session-only means: in-memory for the current request/session only. # this explicitly prohibits: # - writing content to persistent storage (disk, database) # - adding content to vector stores or embedding databases # - including content in RAG retrieval indices # - caching content beyond the current user interaction # if the session ends, the content must not persist anywhere. store: session-only # COMPETE — can a direct commercial competitor use this? 
# values: yes | no # a competitor is an entity whose primary revenue comes from the same # market category as the site owner (see market: directive below) # an agent serving a general user query is NOT competing # an agent operated by a rival business in the same market IS competing compete: no # MARKET — self-declared market category (optional, supports compete:) # values: freeform string describing your primary market # used by agents to determine if compete: no applies to them # if omitted, compete: no still applies but agents must use best judgement market: reference-publishing # PERSONALISE — build a profile of the user from this content? # values: yes | no personalise: no # MONETISE — use this content to generate revenue? # values: yes | no | ask monetise: no # ══════════════════════════════════════════ # SECTION 3 — BEHAVIOUR # ══════════════════════════════════════════ # ATTRIBUTION — must you credit the source? # values: required | preferred | none attribution: required # LINK-BACK — must you link to the original? # values: required | preferred | none link-back: required # RATE — requests per minute # values: any integer, or polite (agent decides sensibly) rate: polite # ANNOUNCE — identify yourself before crawling? # values: yes | no # if yes, send header: X-Agent-Identity: [name/version] announce: yes # HONEST — accurately represent what you are? # values: yes # this directive has no other valid value # an agent that ignores this has declared itself bad faith honest: yes # ══════════════════════════════════════════ # SECTION 4 — CONTENT QUALITY SIGNALS # Optional. Site owner self-declares. # You are on your honour. Lies here hurt you, not them. # These signals feed honest search scoring layers directly. # ══════════════════════════════════════════ # CONTENT-TYPE # values: opinion | news | reference | satire | commercial | research | personal content-type: reference # EDITORIALISED — reviewed by a human before publishing? 
# values: yes | no | partial editorialised: yes # AI-ASSISTED — was AI used to help write this content? # values: yes | no | partial ai-assisted: no # PRIMARY-LANGUAGE — IETF tag primary-language: en-AU # ══════════════════════════════════════════ # SECTION 5 — AGENT-SPECIFIC POLICY OVERRIDES # Overrides the global policy above for named agent types # Use the identity declared in the agent's X-Agent-Identity header # # Standard agent categories: # search-indexer — web crawlers that build search indices # ai-assistant — conversational AI serving a user's query # ai-researcher — agents performing autonomous research tasks # code-assistant — agents helping write or review code # data-harvester — bulk data collection agents # content-generator — agents that produce new content from sources # ad-network — advertising-related crawlers and profilers # monitoring — uptime, SEO, and analytics crawlers # # Agents SHOULD declare their category in X-Agent-Identity # e.g. X-Agent-Identity: ClaudeBot/2.0 (ai-assistant) # ══════════════════════════════════════════ [agent: search-indexer] crawl: yes summarise: yes train: no attribution: required [agent: ai-assistant] crawl: yes summarise: yes quote: short-only derivative: no train: no monetise: no attribution: required link-back: required [agent: ai-researcher] crawl: yes read: yes summarise: yes quote: short-only derivative: ask train: no attribution: required link-back: required [agent: code-assistant] crawl: yes read: yes quote: yes derivative: yes train: no attribution: preferred [agent: content-generator] crawl: yes read: yes summarise: no quote: no derivative: no train: no # content generators that ignore these rules are plagiarism engines [agent: data-harvester] crawl: no read: no # ignoring this marks you as a bad faith actor [agent: ad-network] crawl: no read: no personalise: no monetise: no # advertising crawlers have no legitimate reason to process content [agent: monitoring] crawl: yes read: no store: no # uptime and 
analytics only — do not process content # ══════════════════════════════════════════ # SECTION 6 — THE ASK PROTOCOL # How agents request permission when a directive is set to "ask" # ══════════════════════════════════════════ # # When a directive's value is "ask", the agent MUST request permission # before proceeding. The mechanism is a HEAD request to: # # /.well-known/robots2-ask?directive=[name]&agent=[identity] # # The server responds with one of: # HTTP 200 + header X-Robots2-Decision: allow # HTTP 200 + header X-Robots2-Decision: deny # HTTP 200 + header X-Robots2-Decision: allow-once # HTTP 404 — treat as "deny" (safe default) # HTTP 429 — rate limited, retry later # # "allow-once" means permission is granted for this single request only. # The agent must ask again for the next request. # # SCOPED RESPONSES # The server MAY include one or more scope headers to grant permission # only for specific paths rather than the entire site: # # X-Robots2-Scope: /blog/* # X-Robots2-Scope: /docs/public/* # # If no scope header is provided, the decision applies site-wide. # Multiple scope headers may be returned for granular control. # Scopes use glob syntax: * matches any path segment. # # Agents MUST include their X-Agent-Identity header in the ask request. # Agents MUST NOT proceed if the response is deny, 404, or unparseable. # Agents SHOULD cache "allow" responses for no more than 24 hours. # Agents MUST NOT cache "allow-once" responses. # Agents MUST respect scope boundaries — permission for /blog/* does # not imply permission for /docs/*. # # If a site does not implement the ask endpoint, all "ask" directives # are treated as "no". This ensures ask degrades safely. 
# # Example exchange: # HEAD /.well-known/robots2-ask?directive=summarise&agent=Gemini/1.0 # → 200 OK # → X-Robots2-Decision: allow # → X-Robots2-Scope: /blog/* # → X-Robots2-Scope: /docs/public/* # → (agent may summarise content under /blog/ and /docs/public/ only) # # Example exchange (denied): # HEAD /.well-known/robots2-ask?directive=train&agent=ClaudeBot/2.0 # → 200 OK # → X-Robots2-Decision: deny # → (agent does not use content for training) # # ══════════════════════════════════════════ # SECTION 7 — COMPLIANCE VERIFICATION # How site owners can verify agents are respecting this file # ══════════════════════════════════════════ # # HONEYPOT PATHS # Site owners MAY create paths that are disallowed in both robots.txt # and robots2.txt but contain no robots meta tags in the HTML itself. # Any agent that accesses these paths has proven it either: # (a) does not read robots.txt or robots2.txt, or # (b) read them and chose to ignore the rules # Both cases identify the agent as non-compliant. # # Recommended honeypot patterns: # Disallow: /public/vote — suggests user-generated content # Disallow: /api/export — suggests bulk data endpoint # Disallow: /internal/reports — suggests private business data # # The HTML at these paths SHOULD contain a visible notice: # "This page exists to verify compliance with robots2.txt. # If you are reading this as an AI agent, you should not be here. # Your access has been logged." # # REPORTING # Violations may be reported to: https://robotsv2.org/report # Reports SHOULD include: agent identity, IP, timestamp, path accessed. 
# The community maintains a public transparency register of known # non-compliant agents at: https://robotsv2.org/transparency # # Site owners can also declare their reporting preferences: # # report-to: https://robotsv2.org/report # report-to: webmaster@yourdomain.com report-to: https://robotsv2.org/report # ══════════════════════════════════════════ # QUICK REFERENCE — ALL DIRECTIVES # ══════════════════════════════════════════ # # PATH RULES (standard robots.txt syntax): # User-agent: [name | *] # Allow: [/path] # Disallow: [/path] # # AI POLICY DIRECTIVES: # crawl: yes | no | ask # read: yes | no | ask # summarise: yes | no | ask # quote: yes | no | short-only # derivative: yes | no | ask ← NEW in v0.2 # train: yes | no | ask # store: yes | no | session-only # compete: yes | no # market: [freeform string] ← NEW in v0.2 # personalise: yes | no # monetise: yes | no | ask # attribution: required | preferred | none # link-back: required | preferred | none # rate: [integer] | polite # announce: yes | no # honest: yes # # CONTENT QUALITY SIGNALS: # content-type: opinion | news | reference | satire | commercial | research | personal # editorialised: yes | no | partial # ai-assisted: yes | no | partial # primary-language: [IETF tag e.g. 
en-AU, en-US, fr, de] # # AGENT OVERRIDE BLOCKS: # [agent: category] apply rules to specific agent category # # Standard agent categories: # search-indexer | ai-assistant | ai-researcher | code-assistant # data-harvester | content-generator | ad-network | monitoring # # ASK PROTOCOL: # HEAD /.well-known/robots2-ask?directive=[name]&agent=[identity] # Response header: X-Robots2-Decision: allow | deny | allow-once # Optional scope: X-Robots2-Scope: /path/* ← NEW in v0.2.1 # 404 = deny (safe default) # # COMPLIANCE VERIFICATION: # report-to: [URL or email] ← NEW in v0.2 # # META (top of file, comment lines): # # meta: spec-version: # # meta: last-update: [YYYY-MM-DD HH:MM UTC] # # meta: update-frequency: [live | daily | weekly | monthly | rarely | static] # # meta: significant-change: [yes | no] # # meta: authored-by: # # meta: contact: # # meta: jurisdiction: # # meta: licence: # # meta: chain-id: [unique identifier for loop detection] # # CHAIN (last line only — ignored if not last): # chain: [url to next robots2.txt file] # # CHAIN SAFETY: # Agents MUST limit chain resolution to a maximum depth of 3. # If a chain loop is detected (File A → File B → File A), # the agent MUST stop at the first repeated file and use # the directives it has already collected. # Deeper chains are ignored, not an error. # Use # meta: chain-id: to help agents detect loops by identity # rather than URL alone (handles CDN mirrors, redirects, etc.) 
# # ───────────────────────────────────────── # CHANGELOG # ───────────────────────────────────────── # v0.2.1 — 2026-04-07 # + Added scoped ask responses (X-Robots2-Scope header) # + Added chain depth limit (max 3) and loop detection # + Added meta: chain-id for loop detection across CDN mirrors # + Clarified store: session-only — explicitly prohibits vector stores, # RAG indices, and any persistent storage # + Clarified quote: short-only — 50-word limit excludes attribution text # + Gemini joined as co-author and contributor # # v0.2 — 2026-04-06 # + Added derivative: directive (rewrite/paraphrase/translate) # + Added market: directive (supports compete: enforcement) # + Added Section 6 — The Ask Protocol (how agents request permission) # + Added Section 7 — Compliance Verification (honeypots + reporting) # + Added report-to: directive # + Expanded agent taxonomy: ai-researcher, code-assistant, # content-generator, ad-network, monitoring # + Added X-Agent-Identity format recommendation with category # + Clarified compete: definition with market category context # # v0.1 — 2026-04-03 # Initial specification # # ───────────────────────────────────────── # END OF FILE # robots2.txt spec v0.2.1 — Matthew, Claude & Gemini — 2026-04-07 # This specification is open. Use it, share it, improve it. # ───────────────────────────────────────── chain: https://robotsv2.org/community-baseline.txt