# =============================================================================
# robots.txt — web.dataportal.live
# =============================================================================
# Policy: OPEN for both search indexing and AI training/inference.
# We want DataPortal cited in LLM answers AND ranked in search results.
#
# THIS FILE IS A LIVING DOCUMENT — review monthly.
# New AI crawlers appear regularly. Add them here as they emerge.
#
# Last reviewed: 2026-04-30
# Next review:   2026-05-30
#
# Maintenance references (check these for new bot user-agents):
#   - https://darkvisitors.com/agents              (community-maintained AI bot list)
#   - https://platform.openai.com/docs/bots         (GPTBot, ChatGPT-User, OAI-SearchBot)
#   - https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler (ClaudeBot, Claude-User, Claude-SearchBot)
#   - https://developers.google.com/search/docs/crawling-indexing/google-common-crawlers
#   - https://www.perplexity.ai/perplexitybot
#   - https://support.apple.com/en-us/119829       (Applebot, Applebot-Extended)
#   - https://commoncrawl.org/ccbot
#
# When updating: keep alphabetical inside each section, bump "Last reviewed" date,
# rebuild + deploy.
# =============================================================================

# ---- Search engine crawlers ----
# NOTE: a crawler obeys only the most specific group that matches its product
# token (RFC 9309, section 2.2.1). The bots named in this section therefore do
# NOT pick up the Disallow rules listed under `User-agent: *` further down;
# they are allowed everywhere.
User-agent: Googlebot
Allow: /

User-agent: Googlebot-Image
Allow: /

User-agent: Googlebot-News
Allow: /

User-agent: Bingbot
Allow: /

User-agent: DuckDuckBot
Allow: /

User-agent: Slurp
Allow: /

User-agent: YandexBot
Allow: /

User-agent: Baiduspider
Allow: /

User-agent: Applebot
Allow: /

User-agent: Sogou
Allow: /

# Legacy Naver token, kept for backward compatibility.
User-agent: Naverbot
Allow: /

# Naver's crawler identifies itself as "Yeti"; without this group it would
# fall through to the `User-agent: *` rules instead of the open policy above.
User-agent: Yeti
Allow: /

User-agent: SeznamBot
Allow: /

# ---- AI training, retrieval & answer-engine crawlers (explicitly ALLOWED) ----
# Layout: one group per vendor. Consecutive User-agent lines form a single
# group and share the rule that follows them (RFC 9309, section 2.2.1).
# Do not insert blank lines between User-agent lines of the same group.

# Anthropic
User-agent: ClaudeBot
User-agent: Claude-Web
User-agent: anthropic-ai
User-agent: Claude-User
User-agent: Claude-SearchBot
Allow: /

# OpenAI
User-agent: GPTBot
User-agent: ChatGPT-User
User-agent: OAI-SearchBot
Allow: /

# Google AI
# Google-Extended is a robots.txt control token for Gemini / Vertex AI use of
# content (it has no crawler user-agent string of its own); GoogleOther is
# Google's generic crawler used by product teams.
User-agent: Google-Extended
User-agent: GoogleOther
Allow: /

# Perplexity
User-agent: PerplexityBot
User-agent: Perplexity-User
Allow: /

# Apple Intelligence
User-agent: Applebot-Extended
Allow: /

# Common Crawl (corpus behind many open LLM datasets)
User-agent: CCBot
Allow: /

# Meta / Facebook AI
User-agent: FacebookBot
User-agent: Meta-ExternalAgent
User-agent: Meta-ExternalFetcher
Allow: /

# ByteDance / TikTok / Doubao
User-agent: Bytespider
User-agent: Doubao
Allow: /

# Cohere
User-agent: cohere-ai
User-agent: cohere-training-data-crawler
Allow: /

# You.com
User-agent: YouBot
Allow: /

# Diffbot
User-agent: Diffbot
Allow: /

# Mistral
User-agent: MistralAI-User
Allow: /

# Amazon
User-agent: Amazonbot
Allow: /

# DuckDuckGo AI
User-agent: DuckAssistBot
Allow: /

# Phind
User-agent: PhindBot
Allow: /

# Kagi
User-agent: KagiBot
Allow: /

# Brave Search
User-agent: Bravebot
Allow: /

# Webz.io / news aggregation
User-agent: Webzio-Extended
Allow: /

# Timpi
User-agent: TimpiBot
Allow: /

# Velen
User-agent: Velen-Crawler
Allow: /

# Omgili (generative search)
User-agent: omgili
Allow: /

# Awario / brand monitoring
User-agent: AwarioRssBot
User-agent: AwarioSmartBot
Allow: /

# Scaleserp / SerpAPI
User-agent: SerpApiBot
Allow: /

# ---- Default policy ----
# Crawlers that have no named group above land here: everything is open
# except generated/internal paths.
#
# The Disallow lines are listed BEFORE the catch-all Allow so that parsers
# which apply the first matching rule (e.g. Python's urllib.robotparser) still
# honor them. Longest-match parsers (RFC 9309) reach the same result in either
# order.
#
# NOTE(review): /_astro/ presumably holds built CSS/JS assets -- confirm that
# blocking it does not hinder page rendering for crawlers using this group.
User-agent: *
Disallow: /api/
Disallow: /_astro/
Disallow: /functions/
Allow: /
# Crawl rate hint (for polite crawlers). Crawl-delay is a non-standard
# directive (not part of RFC 9309): some crawlers honor it, Googlebot ignores
# it. It is kept inside the group with no blank line before it, because
# parsers that treat a blank line as the end of a record would otherwise drop
# it. It applies only to crawlers that fall through to this `*` group; the
# named groups above carry no Crawl-delay.
Crawl-delay: 1

# ---- Sitemap ----
# Sitemap is not tied to any User-agent group; it applies to every crawler
# regardless of where it appears in the file. The URL must be absolute.
Sitemap: https://web.dataportal.live/sitemap-index.xml