diff --git a/unified-doc/build-docs.sh b/unified-doc/build-docs.sh
index 80d5b347..a5cc9fca 100755
--- a/unified-doc/build-docs.sh
+++ b/unified-doc/build-docs.sh
@@ -17,6 +17,8 @@
 #   --zlan-branch=BRANCH         Branch for netfoundry/zlan                       (default: main)
 #   --platform-branch=BRANCH     Branch for netfoundry/platform-doc               (default: main)
 #   --data-connector-branch=BRANCH  Branch for netfoundry/nf-data-connector      (default: main)
+#   --llm-gateway-branch=BRANCH  Branch for openziti/llm-gateway                  (default: main)
+#   --mcp-gateway-branch=BRANCH  Branch for openziti/mcp-gateway                  (default: main)
 #   --clean                      Wipe _remotes and .docusaurus cache before building
 #   --lint-only                  Run lint checks only; skip build
 #   --qualifier=VALUE            Append VALUE to output dir (e.g. --qualifier=-preview -> build-preview)
@@ -34,7 +36,7 @@
 #   BB_USERNAME                  Bitbucket username (default: x-token-auth)
 #   DOCUSAURUS_BUILD_MASK        Hex bitmask: 0x1=openziti 0x2=frontdoor 0x4=selfhosted
 #                                             0x8=zrok 0x10=zlan 0x20=platform
-#                                             0x40=data-connector 0xFF=all (default: 0xFF)
+#                                             0x40=data-connector 0x80=llm-gateway 0x100=mcp-gateway 0x1FF=all (default: 0x1FF)
 #   DOCUSAURUS_PUBLISH_ENV       Set to 'prod' to use production Algolia index
 #   NO_MINIFY                    Set to any value to pass --no-minify to Docusaurus
 #   IS_VERCEL                    Set to 'true' on Vercel preview deployments
@@ -62,6 +64,8 @@ BRANCH_SELFHOSTED="main"
 BRANCH_ZLAN="main"
 BRANCH_PLATFORM="main"
 BRANCH_DATA_CONNECTOR="main"
+BRANCH_LLM_GATEWAY="main"
+BRANCH_MCP_GATEWAY="main"
 
 usage() {
   sed -n '/^# USAGE/,/^# =====/{ /^# =====/d; s/^# \{0,1\}//; p }' "$0"
@@ -76,6 +80,8 @@ while [[ $# -gt 0 ]]; do
     --zlan-branch=*)        BRANCH_ZLAN="${1#*=}";        shift ;;
     --platform-branch=*)    BRANCH_PLATFORM="${1#*=}";   shift ;;
     --data-connector-branch=*) BRANCH_DATA_CONNECTOR="${1#*=}"; shift ;;
+    --llm-gateway-branch=*) BRANCH_LLM_GATEWAY="${1#*=}"; shift ;;
+    --mcp-gateway-branch=*) BRANCH_MCP_GATEWAY="${1#*=}"; shift ;;
     --ziti-doc-branch)      BRANCH_ZITI_DOC="${2:?--ziti-doc-branch requires a value}";     shift 2 ;;
     --zrok-branch)          BRANCH_ZROK="${2:?--zrok-branch requires a value}";             shift 2 ;;
     --frontdoor-branch)     BRANCH_FRONTDOOR="${2:?--frontdoor-branch requires a value}";   shift 2 ;;
@@ -83,6 +89,8 @@ while [[ $# -gt 0 ]]; do
     --zlan-branch)          BRANCH_ZLAN="${2:?--zlan-branch requires a value}";             shift 2 ;;
     --platform-branch)      BRANCH_PLATFORM="${2:?--platform-branch requires a value}";     shift 2 ;;
     --data-connector-branch) BRANCH_DATA_CONNECTOR="${2:?--data-connector-branch requires a value}"; shift 2 ;;
+    --llm-gateway-branch)   BRANCH_LLM_GATEWAY="${2:?--llm-gateway-branch requires a value}"; shift 2 ;;
+    --mcp-gateway-branch)   BRANCH_MCP_GATEWAY="${2:?--mcp-gateway-branch requires a value}"; shift 2 ;;
     --clean) CLEAN=1; shift ;;
     --lint-only) LINT_ONLY=1; shift ;;
     -h|--help) usage; exit 0 ;;
@@ -109,6 +117,8 @@ echo "  BRANCH_SELFHOSTED='$BRANCH_SELFHOSTED'"
 echo "  BRANCH_ZLAN='$BRANCH_ZLAN'"
 echo "  BRANCH_PLATFORM='$BRANCH_PLATFORM'"
 echo "  BRANCH_DATA_CONNECTOR='$BRANCH_DATA_CONNECTOR'"
+echo "  BRANCH_LLM_GATEWAY='$BRANCH_LLM_GATEWAY'"
+echo "  BRANCH_MCP_GATEWAY='$BRANCH_MCP_GATEWAY'"
 echo "  CLEAN=$CLEAN"
 echo "  IS_VERCEL='${IS_VERCEL:-}'"
 echo "  node: $(node --version 2>/dev/null || echo 'not found')"
@@ -263,6 +273,8 @@ lint_docs() {
         "${script_dir}/_remotes/openziti/docusaurus/docs"
         "${script_dir}/_remotes/platform/docusaurus/docs"
         "${script_dir}/_remotes/data-connector/docusaurus/docs"
+        "${script_dir}/_remotes/llm-gateway/docusaurus/docs"
+        "${script_dir}/_remotes/mcp-gateway/docusaurus/docs"
     )
 
     # 2. VERIFY FOLDERS
@@ -394,6 +406,8 @@ clone_or_update "https://github.com/netfoundry/zlan.git"
 clone_or_update "https://github.com/openziti/zrok.git"                           zrok       "$BRANCH_ZROK"
 clone_or_update "https://bitbucket.org/netfoundry/platform-doc.git"              platform   "$BRANCH_PLATFORM"
 clone_or_update "https://bitbucket.org/netfoundry/nf-data-connector.git"         data-connector "$BRANCH_DATA_CONNECTOR"
+clone_or_update "https://github.com/openziti/llm-gateway.git"                    llm-gateway    "$BRANCH_LLM_GATEWAY"
+clone_or_update "https://github.com/openziti/mcp-gateway.git"                    mcp-gateway    "$BRANCH_MCP_GATEWAY"
 
 echo "Cleaning stale build artifacts from remotes..."
 find "$script_dir/_remotes" -type d \( -path "*/docusaurus/build" -o -path "*/docusaurus/.docusaurus" -o -path "*/website/build" -o -path "*/website/.docusaurus" \) -exec rm -rf {} + 2>/dev/null || true
diff --git a/unified-doc/docs/llm-gateway/api-keys.md b/unified-doc/docs/llm-gateway/api-keys.md
deleted file mode 100644
index 690efa6d..00000000
--- a/unified-doc/docs/llm-gateway/api-keys.md
+++ /dev/null
@@ -1,114 +0,0 @@
----
-title: Virtual API keys
-sidebar_label: Virtual API keys
----
-
-# Virtual API keys
-
-The gateway supports virtual API keys — gateway-issued bearer tokens that identify clients and optionally
-restrict what they can access. These are "virtual" because they're not upstream provider keys; they're
-managed entirely by the gateway.
-
-Clients send the key in the standard `Authorization: Bearer <key>` header, matching the convention used
-with OpenAI and other providers. Existing tools (Open WebUI, LiteLLM clients, curl scripts) work without
-changes beyond configuring a key.
-
-## Configuration
-
-Keys are defined in the gateway config file. Each key has a name (for logging and attribution), a secret
-value, and optional constraints:
-
-```yaml
-api_keys:
-  enabled: true
-  keys:
-    - name: alice
-      key: "sk-gw-abc123..."
-      allowed_models: ["claude-*", "gpt-*"]
-      allowed_routes: ["coding", "general"]
-
-    - name: bob
-      key: "sk-gw-def456..."
-      allowed_models: ["llama3", "qwen3-vl:*"]
-
-    - name: ci-pipeline
-      key: "sk-gw-xyz789..."
-      allowed_models: ["*"]
-```
-
-When `api_keys` is omitted or `enabled: false`, the gateway operates without authentication — open access.
-
-Keys support environment variable substitution:
-
-```yaml
-keys:
-  - name: alice
-    key: "${ALICE_API_KEY}"
-```
-
-## Key format
-
-Keys use the prefix `sk-gw-` to distinguish them from upstream provider keys (OpenAI `sk-`, Anthropic
-`sk-ant-`). Generate a key with the CLI:
-
-```bash
-llm-gateway genkey
-```
-
-```text title="Output"
-sk-gw-a1b2c3d4e5f6...
-```
-
-Keys are stored as plaintext in the config file, consistent with how upstream API keys are stored.
-
-## Authentication flow
-
-Every incoming request passes through the auth middleware before reaching any handler:
-
-```
-Client request
-    |
-    v
-Auth middleware
-    |-- /health, /metrics -> pass through (no auth required)
-    |-- api_keys disabled -> pass through
-    |-- Authorization header missing -> 401
-    |-- Key not recognized -> 401
-    |-- Key valid -> attach identity to context, continue
-    |
-    v
-Existing handler pipeline (unchanged)
-```
-
-## Model restrictions
-
-Each key can specify `allowed_models` as a list of glob patterns (e.g., `claude-*` matches any Claude
-model). A key with `allowed_models: ["*"]` or no `allowed_models` field has unrestricted access.
-
-Model permission is checked after the model is fully resolved (including semantic routing) but before
-the request is dispatched to a provider. If the resolved model doesn't match any allowed pattern, the
-gateway returns 403.
-
-## Route restrictions
-
-When semantic routing is enabled, a key can specify `allowed_routes` to limit which semantic routes it
-can use. If semantic routing selects a route the key can't access, the gateway returns 403 — it doesn't
-silently reroute to a fallback.
-
-## Error responses
-
-Errors follow the OpenAI-compatible format:
-
-| Scenario | Status | Error type |
-|---|---|---|
-| Missing `Authorization` header | 401 | `authentication_error` |
-| Invalid key | 401 | `authentication_error` |
-| Model not allowed | 403 | `permission_error` |
-| Route not allowed | 403 | `permission_error` |
-
-## Logging and metrics
-
-The validated key name is included in:
-
-- **Semantic routing log lines**: `semantic routing: key='alice' method=semantic route='coding' ...`
-- **Request metrics**: `key` label on `llm_gateway.requests` and `llm_gateway.request.duration`
diff --git a/unified-doc/docs/llm-gateway/configuration.md b/unified-doc/docs/llm-gateway/configuration.md
deleted file mode 100644
index 3e2c6d7f..00000000
--- a/unified-doc/docs/llm-gateway/configuration.md
+++ /dev/null
@@ -1,194 +0,0 @@
----
-title: Configuration reference
-sidebar_label: Configuration
----
-
-# Configuration reference
-
-NetFoundry LLM Gateway is configured with a YAML file. CLI flags can override individual settings.
-
-## Gateway settings
-
-Controls the listen address for the gateway process:
-
-```yaml
-listen: ":8080"   # address to listen on (default: :8080)
-```
-
-To expose the gateway over a zrok overlay instead of a local port, add a top-level `zrok:` block:
-
-```yaml
-zrok:
-  share:
-    enabled: false
-    mode: private
-    token: ""
-```
-
-## Providers
-
-Configure which inference providers the gateway can route to:
-
-```yaml
-providers:
-  open_ai:
-    api_key: ${OPENAI_API_KEY}    # supports environment variable expansion
-
-  anthropic:
-    api_key: ${ANTHROPIC_API_KEY}
-
-  local:
-    base_url: http://localhost:11434
-```
-
-## Virtual API keys
-
-Restrict client access with named keys and per-key model permissions:
-
-```yaml
-api_keys:
-  enabled: true
-  keys:
-    - name: alice
-      key: ${ALICE_KEY}
-      allowed_models: ["gpt-*", "claude-*"]
-    - name: bob
-      key: ${BOB_KEY}
-      allowed_models: ["llama*"]
-```
-
-See [Virtual API keys](api-keys.md) for a full reference.
-
-## Routing
-
-Enable semantic routing and define named routes:
-
-```yaml
-routing:
-  default_route: general
-  semantic:
-    enabled: true
-    provider: local
-    model: nomic-embed-text
-    threshold: 0.75
-    ambiguous_threshold: 0.5
-  routes:
-    - name: coding
-      model: claude-haiku-4-5-20251001
-      description: "code generation, debugging, and technical tasks"
-      examples:
-        - "write a python function to sort a list"
-```
-
-See [Semantic routing](semantic-routing.md) for a full reference.
-
-## Metrics
-
-Expose a Prometheus metrics endpoint:
-
-```yaml
-metrics:
-  enabled: true
-```
-
-## Tracing
-
-Enable request body logging for debugging routing decisions:
-
-```yaml
-tracing:
-  enabled: true
-  max_content_length: 200   # max characters per message in log output
-```
-
-When enabled, each chat completion request is logged with the model, message count, streaming flag,
-tool count, and each message's role and truncated content.
-
-## Environment variables
-
-String values support `${VAR_NAME}` expansion. Variables are expanded at startup:
-
-```bash
-export OPENAI_API_KEY=sk-...
-export ANTHROPIC_API_KEY=sk-ant-...
-llm-gateway run config.yaml
-```
-
-## Complete example
-
-A full configuration combining all sections:
-
-```yaml
-listen: "0.0.0.0:8080"
-
-zrok:
-  share:
-    enabled: true
-    token: ${ZROK_SHARE_TOKEN}
-
-api_keys:
-  enabled: true
-  keys:
-    - name: primary
-      key: ${PRIMARY_API_KEY}
-      allowed_models: ["gpt-*", "claude-*", "llama*"]
-
-providers:
-  open_ai:
-    api_key: ${OPENAI_API_KEY}
-
-  anthropic:
-    api_key: ${ANTHROPIC_API_KEY}
-
-  local:
-    base_url: http://localhost:11434
-
-routing:
-  default_route: general
-  semantic:
-    enabled: true
-    provider: local
-    model: nomic-embed-text
-    threshold: 0.75
-
-metrics:
-  enabled: true
-```
-
-## Run the gateway
-
-Pass the config file path as the first argument:
-
-```bash
-llm-gateway run config.yaml
-```
-
-## CLI flags
-
-```
-llm-gateway run <config-path> [flags]
-
-Flags:
-  --address string   Gateway listen address (e.g., 0.0.0.0:8080)
-  --zrok             Enable zrok share (boolean)
-  --zrok-mode string Zrok share mode (private or public)
-  -h, --help         Show help
-```
-
-CLI flags take precedence over the config file.
-
-## Startup sequence
-
-When the gateway starts, it:
-
-1. Loads and parses the YAML config file.
-2. Applies any CLI flag overrides.
-3. Expands environment variables.
-4. Initializes providers (OpenAI, Anthropic, local/self-hosted) in order.
-5. Creates the model-to-provider router.
-6. Initializes OpenTelemetry metrics (if enabled).
-7. Initializes the semantic router (if configured).
-8. Starts the HTTP server (local or via zrok share).
-
-On shutdown (SIGINT/SIGTERM), the gateway closes all providers, deletes ephemeral zrok shares, and
-releases zrok access objects before exiting.
diff --git a/unified-doc/docs/llm-gateway/connect-zrok.md b/unified-doc/docs/llm-gateway/connect-zrok.md
deleted file mode 100644
index 19cf848b..00000000
--- a/unified-doc/docs/llm-gateway/connect-zrok.md
+++ /dev/null
@@ -1,109 +0,0 @@
----
-title: Connect via zrok
-sidebar_label: Connect via zrok
----
-
-# Connect via zrok
-
-The gateway uses [zrok](https://zrok.io) in two independent ways:
-
-- **Sharing**: Exposes the gateway over a zrok share so clients can reach it without a public IP or
-  open ports.
-- **Accessing**: Connects to backend providers through zrok shares instead of direct HTTP.
-
-Both use zrok's overlay network built on [OpenZiti](https://openziti.io).
-
-## Prerequisites
-
-The gateway requires a zrok environment on the host machine. If `zrok enable` hasn't been run, the
-gateway fails at startup:
-
-```
-zrok environment is not enabled; run 'zrok enable' first
-```
-
-This applies to both sharing and accessing.
-
-## Share the gateway
-
-Instead of listening on a TCP port, the gateway can serve traffic through a zrok share. Clients connect
-to the share token rather than an IP address.
-
-### Ephemeral shares
-
-An ephemeral share is created at startup and deleted when the gateway shuts down.
-
-1. Add the zrok config to `config.yaml`:
-
-    ```yaml
-    zrok:
-      share:
-        enabled: true
-        mode: private    # or public
-    ```
-
-    Alternatively, pass flags at runtime:
-
-    ```bash
-    llm-gateway run config.yaml --zrok --zrok-mode private
-    ```
-
-2. Start the gateway. The share token is logged at startup:
-
-    ```
-    serving via zrok share 'abc123def456'
-    ```
-
-3. Give clients the share token to connect.
-
-**Public mode** creates a share accessible by anyone with the token. **Private mode** (the default)
-requires the client to have a zrok environment enabled and creates an access-controlled connection
-through the overlay.
-
-### Persistent shares
-
-Ephemeral shares get a new token on every restart. For a stable token, create a persistent share with
-`zrok reserve` and pass its token to the gateway:
-
-```yaml
-zrok:
-  share:
-    enabled: true
-    token: "abc123"    # existing persistent share token
-```
-
-Persistent shares are always private. The gateway connects to the existing share but doesn't delete it
-on shutdown — the share is managed externally.
-
-## Access providers via zrok
-
-Any provider can be reached through a zrok share by setting `zrok_share_token` in its config. This is
-useful when a provider runs on a different machine that isn't directly reachable over the network but
-is connected to the same zrok environment:
-
-```yaml
-providers:
-  local:
-    zrok_share_token: "remote-ollama-token"
-
-  anthropic:
-    api_key: "${ANTHROPIC_API_KEY}"
-    zrok_share_token: "anthropic-proxy-token"
-```
-
-### Multi-endpoint
-
-Each endpoint can independently use zrok or direct HTTP:
-
-```yaml
-providers:
-  local:
-    endpoints:
-      - name: local
-        base_url: "http://localhost:11434"
-      - name: remote-gpu
-        zrok_share_token: "gpu-box-token"
-```
-
-Each endpoint with a `zrok_share_token` gets its own zrok access and HTTP client. The round-robin
-load balancer uses whichever transport is configured per endpoint.
diff --git a/unified-doc/docs/llm-gateway/get-started.md b/unified-doc/docs/llm-gateway/get-started.md
deleted file mode 100644
index 41ca409f..00000000
--- a/unified-doc/docs/llm-gateway/get-started.md
+++ /dev/null
@@ -1,363 +0,0 @@
----
-title: Get started with NetFoundry LLM Gateway
-sidebar_label: Get started
----
-
-# Get started
-
-This guide walks you through installing NetFoundry LLM Gateway and running your first requests. By the
-end, you'll have the gateway proxying requests to one or more inference providers.
-
-## Installation
-
-Choose the installation method that fits your environment.
-
-### Pre-built binaries
-
-Pre-built binaries are available for Linux, macOS, and Windows:
-
-1. Visit the [GitHub Releases](https://github.com/openziti/llm-gateway/releases) page.
-2. Download the binary for your platform.
-3. Make it executable:
-
-   ```bash
-   chmod +x llm-gateway
-   ```
-
-4. Run it:
-
-   ```bash
-   ./llm-gateway run config.yaml
-   ```
-
-### Install with Go
-
-If you have Go 1.22 or later:
-
-```bash
-go install github.com/openziti/llm-gateway/cmd/llm-gateway@latest
-llm-gateway run config.yaml
-```
-
-### Build from source
-
-Clone the repository and build the binary locally:
-
-```bash
-git clone https://github.com/openziti/llm-gateway.git
-cd llm-gateway
-go build -o llm-gateway ./cmd/llm-gateway
-./llm-gateway run config.yaml
-```
-
-## Examples
-
-The examples below progress from a simple single-provider proxy to a full production configuration.
-
-### Proxy a local inference server
-
-1. Start Ollama:
-
-    ```bash
-    ollama serve
-    ```
-
-2. Create `config.yaml`:
-
-    ```yaml
-    local:
-      base_url: http://localhost:11434
-    ```
-
-3. Start the gateway:
-
-    ```bash
-    llm-gateway run config.yaml
-    ```
-
-4. Send a request:
-
-    ```bash
-    curl -X POST http://localhost:8080/v1/chat/completions \
-      -H "Content-Type: application/json" \
-      -d '{
-        "model": "llama2",
-        "messages": [{"role": "user", "content": "Hello"}],
-        "temperature": 0.7
-      }'
-    ```
-
-The gateway listens on `http://localhost:8080` by default.
-
-### Route between OpenAI and Anthropic
-
-The gateway routes requests to the correct provider by prefix-matching on the model name:
-
-```yaml
-providers:
-  open_ai:
-    api_key: ${OPENAI_API_KEY}
-
-  anthropic:
-    api_key: ${ANTHROPIC_API_KEY}
-
-  local:
-    base_url: http://localhost:11434
-```
-
-Requests are routed automatically based on the model prefix: `gpt-*` goes to OpenAI, `claude-*` to
-Anthropic, everything else to the local provider:
-
-```bash
-# Routes to OpenAI
-curl -X POST http://localhost:8080/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{"model": "gpt-4", "messages": [{"role": "user", "content": "Hello from OpenAI"}]}'
-
-# Routes to Anthropic
-curl -X POST http://localhost:8080/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{"model": "claude-3-sonnet-20240229", "messages": [{"role": "user", "content": "Hello from Anthropic"}]}'
-
-# Routes to local Ollama
-curl -X POST http://localhost:8080/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{"model": "llama2", "messages": [{"role": "user", "content": "Hello from Ollama"}]}'
-```
-
-### Restrict API access with virtual keys
-
-1. Generate an API key:
-
-    ```bash
-    llm-gateway genkey
-    # sk-gw-a1b2c3d4e5f6...
-    ```
-
-2. Add `api_keys` to your config, referencing the key and setting per-key model permissions:
-
-    ```yaml
-    api_keys:
-      enabled: true
-      keys:
-        - name: primary
-          key: ${PRIMARY_API_KEY}
-          allowed_models: ["gpt-*", "claude-*"]
-        - name: local-only
-          key: ${LOCAL_API_KEY}
-          allowed_models: ["llama*"]
-
-    providers:
-      open_ai:
-        api_key: ${OPENAI_API_KEY}
-
-      anthropic:
-        api_key: ${ANTHROPIC_API_KEY}
-
-      local:
-        base_url: http://localhost:11434
-    ```
-
-3. Clients send their key in the `Authorization` header:
-
-    ```bash
-    curl -X POST http://localhost:8080/v1/chat/completions \
-      -H "Content-Type: application/json" \
-      -H "Authorization: Bearer sk-gw-a1b2c3d4e5f6..." \
-      -d '{"model": "gpt-4", "messages": [{"role": "user", "content": "Hello"}]}'
-    ```
-
-### Use the Python OpenAI client
-
-The gateway works as a drop-in replacement for the OpenAI Python client. Point `base_url` at the
-gateway and it handles provider routing transparently:
-
-```python
-from openai import OpenAI
-
-client = OpenAI(
-    base_url="http://localhost:8080/v1",
-    api_key="not-needed"  # gateway handles auth
-)
-
-# Routes to OpenAI
-response = client.chat.completions.create(
-    model="gpt-4o",
-    messages=[{"role": "user", "content": "Hello!"}]
-)
-
-# Routes to Anthropic (translated automatically)
-response = client.chat.completions.create(
-    model="claude-sonnet-4-20250514",
-    messages=[{"role": "user", "content": "Hello!"}]
-)
-
-# Routes to local backend (Ollama, vLLM, etc.)
-response = client.chat.completions.create(
-    model="llama3.2",
-    messages=[{"role": "user", "content": "Hello!"}]
-)
-```
-
-### Semantic routing
-
-Route requests automatically based on content analysis, without requiring clients to specify a model:
-
-```yaml
-routing:
-  default_route: general
-  semantic:
-    enabled: true
-    provider: local
-    model: nomic-embed-text
-    threshold: 0.75
-    ambiguous_threshold: 0.5
-  routes:
-    - name: coding
-      model: claude-haiku-4-5-20251001
-      description: "code generation, debugging, and technical tasks"
-      examples:
-        - "write a python function to sort a list"
-        - "debug this segfault in my C code"
-    - name: general
-      model: qwen3-vl:30b
-      description: "general knowledge and conversation"
-      examples:
-        - "what is the capital of France"
-        - "explain how photosynthesis works"
-
-providers:
-  local:
-    base_url: http://localhost:11434
-```
-
-See [Semantic routing](semantic-routing.md) for a full explanation of how routing works.
-
-### Multi-endpoint load balancing
-
-Distribute requests across multiple inference backends:
-
-```yaml
-providers:
-  local:
-    endpoints:
-      - name: ollama-primary
-        base_url: http://localhost:11434
-        weight: 2
-      - name: ollama-secondary
-        base_url: http://localhost:11435
-        weight: 1
-      - name: vllm-endpoint
-        base_url: http://vllm.example.com:8000
-        weight: 1
-
-  open_ai:
-    api_key: ${OPENAI_API_KEY}
-
-  anthropic:
-    api_key: ${ANTHROPIC_API_KEY}
-```
-
-See [Multi-endpoint load balancing](multi-endpoint.md) for health check and failover options.
-
-### Connect via zrok
-
-Share the gateway over a zrok overlay so clients can reach it without a public IP:
-
-```yaml
-zrok:
-  share:
-    enabled: true
-    mode: private
-
-providers:
-  local:
-    base_url: http://localhost:11434
-```
-
-Or access a remote inference backend through a zrok share:
-
-```yaml
-providers:
-  local:
-    endpoints:
-      - name: remote-ollama
-        zrok_share_token: ${ZROK_OLLAMA_TOKEN}
-```
-
-See [Connect via zrok](connect-zrok.md) for setup details.
-
-### Production configuration
-
-A full configuration combining multiple providers, API key authentication, semantic routing, load
-balancing, metrics, and zrok:
-
-```yaml
-listen: "0.0.0.0:8080"
-
-zrok:
-  share:
-    enabled: true
-    token: ${ZROK_SHARE_TOKEN}
-
-api_keys:
-  enabled: true
-  keys:
-    - name: primary
-      key: ${PRIMARY_API_KEY}
-      allowed_models: ["gpt-*", "claude-*", "llama*"]
-    - name: local-only
-      key: ${LOCAL_API_KEY}
-      allowed_models: ["llama*"]
-
-providers:
-  open_ai:
-    api_key: ${OPENAI_API_KEY}
-
-  anthropic:
-    api_key: ${ANTHROPIC_API_KEY}
-
-  local:
-    endpoints:
-      - name: ollama-primary
-        base_url: http://localhost:11434
-        weight: 3
-      - name: ollama-secondary
-        base_url: http://localhost:11435
-        weight: 1
-      - name: vllm-endpoint
-        zrok_share_token: ${ZROK_VLLM_TOKEN}
-        weight: 2
-
-routing:
-  default_route: general
-  semantic:
-    enabled: true
-    provider: local
-    model: nomic-embed-text
-    threshold: 0.75
-    ambiguous_threshold: 0.5
-  routes:
-    - name: coding
-      model: claude-haiku-4-5-20251001
-      description: "code generation, debugging, and technical tasks"
-      examples:
-        - "write a python function to sort a list"
-    - name: general
-      model: qwen3-vl:30b
-      description: "general knowledge and conversation"
-      examples:
-        - "what is the capital of France"
-
-metrics:
-  enabled: true
-```
-
-## More info
-
-- [Configuration](configuration.md): All configuration options
-- [Providers](providers.md): How provider routing and format translation work
-- [Multi-endpoint load balancing](multi-endpoint.md): Advanced load balancing strategies
-- [Virtual API keys](api-keys.md): Client authentication and model-level restrictions
-- [Semantic routing](semantic-routing.md): Intelligent request routing based on content
-- [Metrics](metrics.md): Prometheus metrics and observability
diff --git a/unified-doc/docs/llm-gateway/intro.md b/unified-doc/docs/llm-gateway/intro.md
deleted file mode 100644
index be10c234..00000000
--- a/unified-doc/docs/llm-gateway/intro.md
+++ /dev/null
@@ -1,48 +0,0 @@
----
-title: NetFoundry LLM Gateway overview
-sidebar_label: Overview
-description: >
-  NetFoundry LLM Gateway is an OpenAI-compatible API proxy that routes requests across multiple LLM
-  providers using zero-trust networking over OpenZiti.
----
-
-# NetFoundry LLM Gateway overview
-
-NetFoundry LLM Gateway is an OpenAI-compatible API proxy that routes requests across multiple LLM
-providers using zero-trust networking. The project is open source and can be found at:
-[github.com/openziti/llm-gateway](https://github.com/openziti/llm-gateway).
-
-## What it does
-
-It handles provider routing, format translation, and zero-trust networking so clients interact with a
-single OpenAI-compatible endpoint regardless of which model or provider handles the request.
-
-- **Multi-provider routing**: Routes requests to OpenAI, Anthropic, and any OpenAI-compatible backend
-  (Ollama, vLLM, llama-server, SGLang, etc.) by prefix-matching on the model name.
-- **Zero-trust networking**: Uses zrok over OpenZiti overlay networks to connect to backends across NAT
-  and air-gapped environments — no firewall configuration needed.
-- **Semantic routing**: A three-layer cascade (keyword heuristics → embedding similarity → LLM classifier)
-  automatically selects the right model when the client omits a model name.
-- **Load balancing**: Weighted round-robin across multiple inference servers with health checks and passive
-  failover.
-- **Single binary**: One Go binary, one YAML config file — no database, message queue, or sidecar.
-
-## API endpoints
-
-The gateway exposes standard OpenAI-compatible endpoints:
-
-| Endpoint | Description |
-|---|---|
-| `POST /v1/chat/completions` | Chat completions (streaming and non-streaming) |
-| `GET /v1/models` | List available models from all providers |
-| `GET /health` | Health check |
-| `GET /metrics` | Prometheus metrics (when enabled) |
-
-Streaming works via Server-Sent Events across all providers. Anthropic requests are automatically
-translated to and from OpenAI format, so existing tools that speak OpenAI work without changes.
-
-## Observability
-
-Prometheus metrics track request volume, latency, token usage, routing decisions, and endpoint health.
-Per-request body logging is available for debugging routing behavior.
-
diff --git a/unified-doc/docs/llm-gateway/metrics.md b/unified-doc/docs/llm-gateway/metrics.md
deleted file mode 100644
index 8f1cfb95..00000000
--- a/unified-doc/docs/llm-gateway/metrics.md
+++ /dev/null
@@ -1,163 +0,0 @@
----
-title: Metrics
-sidebar_label: Metrics
----
-
-# Metrics
-
-The gateway exposes OpenTelemetry metrics via a Prometheus exporter. When enabled, metrics are available
-at `GET /metrics` in the standard Prometheus text format.
-
-## Enabling metrics
-
-Add the following to your config file:
-
-```yaml
-metrics:
-  enabled: true
-```
-
-## Instruments
-
-All metric names are prefixed with `llm_gateway.`.
-
-### Request metrics
-
-These metrics track individual requests through the gateway:
-
-**`llm_gateway.requests`** (counter): Total chat completion requests.
-
-| Attribute | Values | Description |
-|---|---|---|
-| `provider` | `openai`, `anthropic`, `ollama` | Which provider handled the request |
-| `model` | Model name | The model used |
-| `streaming` | `true`, `false` | Whether the request was streaming |
-| `key` | Key name or empty | The API key name (when [virtual API keys](api-keys.md) are enabled) |
-
----
-
-**`llm_gateway.request.duration`** (histogram, seconds): End-to-end request duration including
-upstream provider latency.
-
-| Attribute | Values | Description |
-|---|---|---|
-| `provider` | `openai`, `anthropic`, `ollama` | Which provider handled the request |
-| `model` | Model name | The model used |
-| `key` | Key name or empty | The API key name (when [virtual API keys](api-keys.md) are enabled) |
-
----
-
-**`llm_gateway.requests.inflight`** (up-down counter): Number of requests currently being processed.
-Incremented when a request enters the handler, decremented when it completes. Useful for understanding
-concurrency and detecting request pileups. No attributes.
-
-### Token metrics
-
-These metrics track token consumption as reported by each provider:
-
-**`llm_gateway.tokens.prompt`** (counter): Total prompt (input) tokens across all requests.
-
-| Attribute | Values | Description |
-|---|---|---|
-| `provider` | Provider name | Which provider reported the usage |
-| `model` | Model name | The model used |
-
----
-
-**`llm_gateway.tokens.completion`** (counter): Total completion (output) tokens across all requests.
-
-| Attribute | Values | Description |
-|---|---|---|
-| `provider` | Provider name | Which provider reported the usage |
-| `model` | Model name | The model used |
-
-Token metrics are recorded from the `usage` field in non-streaming responses. Streaming responses
-typically don't include token counts.
-
-### Routing metrics
-
-This metric tracks how routing decisions are distributed across the cascade layers:
-
-**`llm_gateway.routing.decisions`** (counter): Semantic routing decisions, counted each time the
-router selects a model.
-
-| Attribute | Values | Description |
-|---|---|---|
-| `method` | `explicit`, `heuristic`, `semantic`, `classifier`, `default` | Which routing layer made the decision |
-
-A high proportion of `default` decisions may indicate that thresholds are too strict or that route
-examples don't cover your traffic well.
-
-### Error metrics
-
-This metric tracks errors returned by upstream providers, broken down by error category:
-
-**`llm_gateway.provider.errors`** (counter): Errors returned by upstream providers.
-
-| Attribute | Values | Description |
-|---|---|---|
-| `error_type` | `invalid_request_error`, `authentication_error`, `rate_limit_error`, `server_error`, `not_found_error`, `service_unavailable`, `unknown` | The error category |
-
-### Health metrics
-
-This metric tracks endpoint availability in multi-endpoint mode:
-
-**`llm_gateway.endpoint.healthy`** (up-down counter): Per-endpoint health status. Value is `1` for
-healthy endpoints and `0` for unhealthy endpoints.
-
-| Attribute | Values | Description |
-|---|---|---|
-| `endpoint` | Endpoint name | The endpoint being reported on |
-
-## Prometheus scraping
-
-Point your Prometheus instance at the gateway's `/metrics` endpoint:
-
-```yaml
-# prometheus.yml
-scrape_configs:
-  - job_name: llm-gateway
-    scrape_interval: 15s
-    static_configs:
-      - targets: ["localhost:8080"]
-```
-
-## Useful queries
-
-Some example PromQL queries to get started:
-
-- Requests per minute by provider:
-
-  ```promql
-  rate(llm_gateway_requests_total[5m]) * 60
-  ```
-
-- Average request duration by model:
-
-  ```promql
-  rate(llm_gateway_request_duration_seconds_sum[5m]) / rate(llm_gateway_request_duration_seconds_count[5m])
-  ```
-
-- Token throughput (tokens per second):
-
-  ```promql
-  rate(llm_gateway_tokens_prompt_total[5m]) + rate(llm_gateway_tokens_completion_total[5m])
-  ```
-
-- Error rate as a percentage of total requests:
-
-  ```promql
-  rate(llm_gateway_provider_errors_total[5m]) / rate(llm_gateway_requests_total[5m]) * 100
-  ```
-
-- Routing method distribution:
-
-  ```promql
-  rate(llm_gateway_routing_decisions_total[5m])
-  ```
-
-- Current in-flight requests:
-
-  ```promql
-  llm_gateway_requests_inflight
-  ```
diff --git a/unified-doc/docs/llm-gateway/multi-endpoint.md b/unified-doc/docs/llm-gateway/multi-endpoint.md
deleted file mode 100644
index f7ad3b5a..00000000
--- a/unified-doc/docs/llm-gateway/multi-endpoint.md
+++ /dev/null
@@ -1,131 +0,0 @@
----
-title: Multi-endpoint load balancing
-sidebar_label: Multi-endpoint load balancing
----
-
-# Multi-endpoint load balancing
-
-When you have multiple inference backends, configure the gateway to distribute requests across them
-with automatic health checking and failover.
-
-## Supported backends
-
-The gateway works with any OpenAI-compatible backend:
-
-- Ollama
-- vLLM
-- llama.cpp
-- SGLang
-- Any server implementing `POST /v1/chat/completions`
-
-## Configuration
-
-Instead of a single `base_url`, define an `endpoints` list:
-
-```yaml
-providers:
-  local:
-    endpoints:
-      - name: ollama-primary
-        base_url: http://localhost:11434
-        weight: 2
-      - name: ollama-secondary
-        base_url: http://localhost:11435
-        weight: 1
-      - name: vllm-endpoint
-        base_url: http://vllm.example.com:8000
-        weight: 1
-```
-
-Each endpoint has:
-
-- **`name`**: A descriptive name for logging and monitoring.
-- **`base_url`**: Direct HTTP access to the backend, or use `zrok_share_token` for overlay network access.
-- **`weight`**: Controls traffic distribution proportion. Optional, default `1`.
-
-The `local` key is the section name — it doesn't restrict which backends you can use. Endpoints can be
-any OpenAI-compatible server: Ollama, vLLM, llama.cpp, SGLang, or any custom server.
-
-To access a remote backend via zrok:
-
-```yaml
-providers:
-  local:
-    endpoints:
-      - name: remote-ollama
-        zrok_share_token: ${ZROK_OLLAMA_TOKEN}
-        weight: 1
-```
-
-## Load balancing
-
-The gateway uses **weighted round-robin** load balancing. An endpoint with `weight: 3` receives roughly
-3× the requests of an endpoint with `weight: 1`.
-
-`GET /v1/models` returns the deduplicated union of models from all healthy endpoints.
-
-## Health checking
-
-A background process periodically checks endpoint health:
-
-```yaml
-providers:
-  local:
-    health_check:
-      interval_seconds: 30   # check every 30 seconds (default)
-      timeout_seconds: 5     # per-endpoint timeout (default)
-```
-
-The health check probes `/v1/models` (standard OpenAI format) or falls back to `/api/tags` (Ollama).
-
-When an endpoint fails a check, the gateway logs `endpoint 'name' is now unhealthy` and stops sending it
-traffic. While the endpoint is unhealthy, health checks continue on an exponential backoff schedule —
-1× the interval after the first failure, up to 10× after repeated failures. When the endpoint recovers,
-the gateway logs `endpoint 'name' is now healthy` and resumes normal traffic.
-
-If the system detects a long gap since the last health check (for example, after a VM sleep/wake cycle),
-endpoint checks are staggered to avoid flooding the network with simultaneous reconnection attempts.
-
-## Failover
-
-When a request fails due to a network problem (connection refused, timeout, etc.), the gateway retries
-on the next healthy endpoint. Application-level errors (HTTP 400, 404, etc.) don't trigger failover —
-they indicate a problem with the request, not the endpoint.
-
-## Semantic routing integration
-
-When semantic routing uses the local provider in multi-endpoint mode, embedding and classifier requests
-automatically benefit from the same load distribution and failover via a shared HTTP client.
-No additional configuration is needed.
-
-## Full example
-
-Three local endpoints with weighted distribution, health checking, and a zrok-connected backup alongside cloud providers:
-
-```yaml
-providers:
-  local:
-    endpoints:
-      - name: ollama-primary
-        base_url: http://localhost:11434
-        weight: 3
-      - name: ollama-secondary
-        base_url: http://localhost:11435
-        weight: 1
-      - name: vllm-prod
-        base_url: http://vllm-prod.example.com:8000
-        weight: 2
-      - name: vllm-backup
-        zrok_share_token: ${ZROK_VLLM_BACKUP_TOKEN}
-        weight: 1
-
-    health_check:
-      interval_seconds: 30
-      timeout_seconds: 5
-
-  open_ai:
-    api_key: ${OPENAI_API_KEY}
-
-  anthropic:
-    api_key: ${ANTHROPIC_API_KEY}
-```
diff --git a/unified-doc/docs/llm-gateway/providers.md b/unified-doc/docs/llm-gateway/providers.md
deleted file mode 100644
index 6cbea189..00000000
--- a/unified-doc/docs/llm-gateway/providers.md
+++ /dev/null
@@ -1,157 +0,0 @@
----
-title: Providers
-sidebar_label: Providers
----
-
-# Providers
-
-The gateway presents a single OpenAI-compatible API to clients and translates requests to the
-appropriate backend provider. Three provider types are supported: OpenAI (and compatible APIs),
-Anthropic, and a local/self-hosted provider for any backend that implements `/v1/chat/completions`.
-
-## API surface
-
-All clients interact with the gateway using the [OpenAI chat completions format](https://developers.openai.com/api/reference/chat-completions/overview):
-
-```
-POST /v1/chat/completions    chat completions (streaming and non-streaming)
-GET  /v1/models              list available models from all providers
-GET  /health                 health check
-GET  /metrics                Prometheus metrics (when enabled)
-```
-
-By default, the gateway doesn't require a client API key — authentication is between the gateway and the
-upstream providers. Optionally, the gateway can enforce its own [virtual API keys](api-keys.md).
-
-## Model routing
-
-Models are routed to providers by prefix-matching on the model name:
-
-| Prefix | Provider |
-|---|---|
-| `gpt-*`, `o1-*`, `o3-*` | OpenAI |
-| `claude-*` | Anthropic |
-| Everything else | Local (configured as `local`) |
-
-Matching is case-insensitive. A request for `gpt-4` goes to OpenAI; `claude-haiku-4-5-20251001` goes to
-Anthropic; `llama3` or `qwen3-vl:30b` go to the local provider.
-
-If the target provider isn't configured, the gateway returns an error:
-
-```json
-{"error": {"message": "provider 'openai' is not configured", "type": "invalid_request_error"}}
-```
-
-## OpenAI provider
-
-The OpenAI provider is a direct pass-through. Requests forward to `POST {base_url}/v1/chat/completions`
-with an `Authorization: Bearer` header. Responses are returned unmodified.
-
-Any OpenAI-compatible API can be used as the OpenAI provider by setting `base_url` — for example,
-Azure OpenAI or a local vLLM server.
-
-Model listing calls `GET {base_url}/v1/models`.
-
-## Anthropic provider
-
-The Anthropic provider translates between the OpenAI format and
-[Anthropic's Messages API](https://docs.anthropic.com/en/docs/api-reference/messages/create). Clients
-send OpenAI-format requests and receive OpenAI-format responses regardless of which provider handles
-the request.
-
-### Request translation
-
-The gateway maps OpenAI request fields to their Anthropic equivalents before forwarding:
-
-| OpenAI field | Anthropic field | Notes |
-|---|---|---|
-| `model` | `model` | Passed through |
-| `messages` (role: system) | `system` | First system message becomes Anthropic's top-level `system` field |
-| `messages` (role: user) | `messages` (role: user) | |
-| `messages` (role: assistant) | `messages` (role: assistant) | |
-| `messages` (role: tool) | `messages` (role: user) | Mapped to user role |
-| `max_tokens` | `max_tokens` | Defaults to 4096 if not set (Anthropic requires this field) |
-| `temperature` | `temperature` | |
-| `top_p` | `top_p` | |
-| `stop` | `stop_sequences` | String or array |
-
-### Response translation
-
-The gateway maps Anthropic response fields back to the OpenAI format before returning to the client:
-
-| Anthropic field | OpenAI field | Notes |
-|---|---|---|
-| `id` | `id` | |
-| `content[].text` | `choices[0].message.content` | Text blocks are concatenated |
-| `usage.input_tokens` | `usage.prompt_tokens` | |
-| `usage.output_tokens` | `usage.completion_tokens` | |
-| `stop_reason` | `choices[0].finish_reason` | `end_turn`/`stop_sequence` → `stop`; `max_tokens` → `length` |
-
-### Streaming translation
-
-Anthropic uses a different streaming event format than OpenAI. The gateway translates on the fly:
-
-| Anthropic event | Action |
-|---|---|
-| `message_start` | Captures the message ID for subsequent chunks |
-| `content_block_delta` | Emitted as an OpenAI-format `chat.completion.chunk` with the delta text |
-| `message_delta` | Emitted as a chunk with the `finish_reason` |
-| `message_stop` | Emitted as the `[DONE]` sentinel |
-
-### Model listing
-
-The provider doesn't query Anthropic for available models. Instead, it returns a static list of current
-and legacy Claude models.
-
-### Error translation
-
-Anthropic error types are mapped to their gateway equivalents:
-
-| Anthropic error type | Gateway error type | HTTP status |
-|---|---|---|
-| `authentication_error` | `authentication_error` | 401 |
-| `rate_limit_error` | `rate_limit_error` | 429 |
-| `invalid_request_error` | `invalid_request_error` | 400 |
-| `not_found_error` | `not_found_error` | 404 |
-| (other) | `server_error` | 500 |
-
-## Local / self-hosted provider
-
-The local provider is a direct pass-through to any OpenAI-compatible backend. Chat completions go to
-`POST {base_url}/v1/chat/completions`. This means Ollama, vLLM, llama.cpp, SGLang, or any server
-exposing this endpoint can be used.
-
-Model listing tries `GET {base_url}/v1/models` first, falling back to Ollama's native
-`GET {base_url}/api/tags`.
-
-For multi-endpoint load balancing and failover, see [Multi-endpoint load balancing](multi-endpoint.md).
-
-## Streaming
-
-All three providers support streaming via Server-Sent Events (SSE). See [Streaming](streaming.md) for
-response format, headers, and how the gateway processes streaming requests.
-
-## Error handling
-
-All providers translate upstream errors into a consistent OpenAI-compatible format:
-
-```json
-{
-  "error": {
-    "message": "description of what went wrong",
-    "type": "error_type",
-    "param": null,
-    "code": null
-  }
-}
-```
-
-| Error type | HTTP status | Typical cause |
-|---|---|---|
-| `invalid_request_error` | 400 | Malformed request, missing model, provider not configured |
-| `authentication_error` | 401 | Invalid API key |
-| `permission_error` | 403 | Insufficient permissions |
-| `not_found_error` | 404 | Model not found |
-| `rate_limit_error` | 429 | Upstream rate limit hit |
-| `server_error` | 500 | Provider returned an unexpected error |
-| `service_unavailable` | 503 | Provider is down |
diff --git a/unified-doc/docs/llm-gateway/semantic-routing.md b/unified-doc/docs/llm-gateway/semantic-routing.md
deleted file mode 100644
index d7c368d7..00000000
--- a/unified-doc/docs/llm-gateway/semantic-routing.md
+++ /dev/null
@@ -1,347 +0,0 @@
----
-title: Semantic routing
-sidebar_label: Semantic routing
----
-
-# Semantic routing
-
-When a request arrives without a `model` field (or with `model: auto`), the gateway uses semantic
-routing to decide which backend model should handle it. Routing uses a three-layer cascade: fast
-heuristic rules are tried first, then embedding-based similarity, then an LLM classifier. Each layer
-can either make a confident decision or pass to the next. If no layer produces a match, the request
-falls back to a configured default route.
-
-## The routing cascade
-
-The router evaluates layers in order and stops at the first confident result:
-
-```
-Request arrives
-    |
-    v
-1. Explicit model? ──yes──> use that model (bypass routing)
-    │no
-    v
-2. Heuristics match? ──yes──> use matched route
-    │no
-    v
-3. Embeddings match?
-    ├─ confident (>= threshold) ──> use matched route
-    ├─ ambiguous (>= ambiguous_threshold but < threshold) ──> escalate to classifier
-    └─ no match
-    v
-4. Classifier match? ──yes──> use classified route
-    │no
-    v
-5. Default route
-```
-
-Each step appends to a **cascade log** visible in the gateway's output:
-
-```
-semantic routing: method=semantic route='coding' model='claude-haiku-4-5-20251001'
-  confidence=0.87 latency=12ms cascade=[heuristic:no_match,semantic:coding:0.87]
-```
-
-### Explicit model passthrough
-
-If the client sends a `model` field and `allow_explicit_model` is `true` (the default), the router uses
-that model directly without evaluating any layers:
-
-```yaml
-routing:
-  allow_explicit_model: true  # default; set false to force all requests through routing
-```
-
-### The `auto` virtual model
-
-Clients that always require a `model` field (such as Open WebUI) can send `model: auto`. The gateway
-clears this to an empty string before routing, which triggers the full cascade. When semantic routing
-is enabled, `auto` appears in the `/v1/models` endpoint so clients can discover it.
-
-## Routes
-
-A route maps a name to a backend model and provides context for the embedding and classifier layers:
-
-```yaml
-routes:
-  - name: coding
-    model: claude-haiku-4-5-20251001
-    description: "code generation, debugging, code review, and technical programming tasks"
-    examples:
-      - "write a python function to sort a list"
-      - "debug this segfault in my C code"
-      - "review this pull request for bugs"
-      - "implement a binary search tree in Go"
-```
-
-Each field serves a specific role across the routing layers:
-
-| Field | Used by | Purpose |
-|---|---|---|
-| `name` | All layers | Identifier for heuristic rules, cascade logs, and classifier output |
-| `model` | All layers | The backend model to use when this route is selected |
-| `description` | Classifier | Included in the classifier prompt |
-| `examples` | Embeddings | Converted to vectors at startup for similarity matching |
-
-## Layer 1: Heuristics
-
-Heuristics are fast, deterministic rules evaluated before any model calls:
-
-```yaml
-heuristics:
-  enabled: true
-  rules:
-    - match:
-        keywords: ["translate", "translation"]
-      route: general
-    - match:
-        has_tools: true
-      route: tools
-    - match:
-        system_prompt_contains: "you are a code assistant"
-      route: coding
-    - match:
-        max_tokens_lt: 100
-        message_length_lt: 200
-      route: fast
-```
-
-Rules are evaluated in order. The first matching rule wins. All conditions within a rule must be true
-(AND logic).
-
-### Match conditions
-
-These conditions can appear in a match block:
-
-- **`keywords`**: Matched against user messages with word boundaries, case-insensitive. Any single
-  keyword matching is sufficient.
-- **`exclude`**: Phrases that suppress a keyword match. If any exclusion phrase is found, the rule
-  doesn't match. Useful for filtering out boilerplate text injected by clients like Open WebUI.
-- **`system_prompt_contains`**: A substring matched against any system message, case-insensitive.
-- **`max_tokens_lt`**: Matches if `max_tokens` is set and strictly less than the given value.
-- **`message_length_lt`**: Matches if the total character count across all messages is strictly less
-  than the given value.
-- **`has_tools`**: Matches if the request includes tool definitions (`true`) or does not (`false`).
-
-### Exclusions
-
-When using broad keywords, you may encounter false positives from boilerplate text injected by clients:
-
-```yaml
-- match:
-    keywords: ["code", "debug", "refactor"]
-    exclude: ["code fences", "code block", "### Task"]
-  route: coding
-```
-
-Exclusions are checked first. If any exclusion phrase matches, the rule is skipped entirely.
-
-## Layer 2: Embeddings
-
-The embedding layer converts text into numerical vectors and uses cosine similarity to find the closest
-route.
-
-At startup, each route's example prompts are embedded and stored in memory. When a request arrives, the
-last user message is embedded and compared against each route's stored vectors. Messages longer than
-2048 characters are truncated before embedding.
-
-### Configuration
-
-Set the following options under `semantic:` in your routing config:
-
-```yaml
-semantic:
-  enabled: true
-  provider: local           # local or openai
-  model: nomic-embed-text   # embedding model name
-  threshold: 0.75           # minimum similarity for a confident match
-  ambiguous_threshold: 0.5  # below threshold but at or above this → escalate to classifier
-  comparison: centroid      # centroid, max, or average
-  cache_embeddings: true    # cache prompt embeddings to avoid repeated calls
-  cache_ttl: 3600           # cache entry lifetime in seconds (default: 3600)
-  cache_size: 1000          # maximum cache entries (default: 1000)
-```
-
-### Comparison modes
-
-Three modes control how the embedding layer compares a request against stored route examples:
-
-- **`centroid`** (default): Averages all example embeddings into a single vector per route. Fastest.
-  Works well when examples cluster around a common theme.
-- **`max`**: Compares against every example individually and uses the highest score. Good when a route
-  covers several distinct sub-topics. More prone to false positives.
-- **`average`**: Compares against every example individually and uses the mean score. Balanced between
-  `centroid` and `max`.
-
-| Situation | Recommended mode |
-|---|---|
-| Examples per route are similar to each other | `centroid` |
-| A route covers several distinct sub-topics | `max` |
-| You want balanced "generally like this route" scoring | `average` |
-
-### Thresholds
-
-```
-score >= threshold                            → confident match, return immediately
-ambiguous_threshold <= score < threshold      → ambiguous, escalate to classifier
-score < ambiguous_threshold                   → no match, continue to next layer
-```
-
-The right values depend on your embedding model and route structure. Models like `nomic-embed-text`
-tend to produce higher similarity scores, so you may need higher thresholds (0.7–0.85 for `threshold`,
-0.4–0.6 for `ambiguous_threshold`).
-
-### Embedding cache
-
-When `cache_embeddings` is true, prompt embeddings are cached in an LRU cache keyed by a SHA-256 hash
-of the prompt text. `cache_size` controls capacity (evicts least recently used when full).
-
-## Layer 3: LLM classifier
-
-The classifier sends the user's prompt to a chat model and asks it to classify the request into one of
-the configured routes. It's typically used as a fallback for ambiguous embedding results but can also
-run standalone.
-
-### Configuration
-
-Set the following options under `classifier:` in your routing config:
-
-```yaml
-classifier:
-  enabled: true
-  provider: local            # local or openai
-  model: qwen3-vl:30b
-  timeout_ms: 10000          # request timeout in milliseconds (0 = no timeout)
-  confidence_threshold: 0.7  # minimum confidence to accept the classification
-  cache_results: true
-  cache_ttl: 3600
-  cache_size: 500
-```
-
-### When the classifier runs
-
-The classifier is invoked when:
-
-- The embedding layer found a route but the score was between `ambiguous_threshold` and `threshold`.
-- Embeddings are disabled and heuristics found no match.
-
-The classifier's result is accepted only if the confidence meets or exceeds `confidence_threshold`.
-
-### Route descriptions matter
-
-The classifier relies on the `description` field to understand what each route represents. Write
-descriptions that are specific enough for an LLM to distinguish between routes — vague descriptions
-produce poor classifications.
-
-The classifier's response may be wrapped in markdown code blocks. The gateway strips those
-automatically before parsing the result.
-
-## Default route
-
-If no layer produces a confident result, the gateway uses:
-
-```yaml
-routing:
-  default_route: general
-```
-
-If `default_route` isn't set, the first route in the `routes` list is the absolute fallback.
-
-## Example configuration
-
-A minimal setup using only the embedding layer, with two routes:
-
-```yaml
-routing:
-  default_route: general
-
-  semantic:
-    enabled: true
-    provider: local
-    model: nomic-embed-text
-    threshold: 0.75
-    ambiguous_threshold: 0.5
-
-  routes:
-    - name: coding
-      model: claude-haiku-4-5-20251001
-      description: "code generation, debugging, and technical tasks"
-      examples:
-        - "write a python function to sort a list"
-        - "debug this segfault in my C code"
-
-    - name: general
-      model: qwen3-vl:30b
-      description: "general knowledge and conversation"
-      examples:
-        - "what is the capital of France"
-        - "explain how photosynthesis works"
-```
-
-## Full configuration reference
-
-All routing options in a single block with defaults shown:
-
-```yaml
-routing:
-  allow_explicit_model: true
-  default_route: general
-
-  heuristics:
-    enabled: true
-    rules:
-      - match:
-          keywords: [...]
-          exclude: [...]
-          system_prompt_contains: "..."
-          max_tokens_lt: 100
-          message_length_lt: 200
-          has_tools: true
-        route: route_name
-
-  semantic:
-    enabled: true
-    provider: local
-    model: nomic-embed-text
-    threshold: 0.75
-    ambiguous_threshold: 0.5
-    comparison: centroid
-    cache_embeddings: false   # default: false
-    cache_ttl: 3600
-    cache_size: 1000
-
-  classifier:
-    enabled: true
-    provider: local
-    model: qwen3-vl:30b
-    timeout_ms: 0             # default: 0 (no timeout)
-    confidence_threshold: 0   # default: 0
-    cache_results: false      # default: false
-    cache_ttl: 3600
-    cache_size: 500
-
-  routes:
-    - name: coding
-      model: claude-haiku-4-5-20251001
-      description: "code generation, debugging, and technical tasks"
-      examples:
-        - "write a python function to sort a list"
-        - "debug this segfault in my C code"
-```
-
-## Tuning tips
-
-A few principles for getting good routing results:
-
-- **Start simple.** Enable only the embedding layer with a few well-chosen examples per route. Add
-  heuristics and the classifier later if needed.
-- **Add more examples before switching comparison modes.** Four well-chosen examples often solve
-  problems that changing `comparison` won't.
-- **Keep examples realistic.** Use prompts that look like what users actually send.
-- **Use heuristics for obvious cases.** If every request containing "translate" should go to the same
-  route, a keyword heuristic is faster and more reliable than embedding similarity.
-- **Watch the cascade logs.** The gateway logs the full cascade for every routed request. This is the
-  best way to understand why a request was routed where it was.
-- **Use metrics for aggregate tuning.** A high proportion of `default` decisions suggests your
-  thresholds are too strict or your examples don't cover your traffic well.
diff --git a/unified-doc/docs/llm-gateway/streaming.md b/unified-doc/docs/llm-gateway/streaming.md
deleted file mode 100644
index 519cd81a..00000000
--- a/unified-doc/docs/llm-gateway/streaming.md
+++ /dev/null
@@ -1,104 +0,0 @@
----
-title: Streaming
-sidebar_label: Streaming
----
-
-# Streaming
-
-All providers support streaming chat completions via Server-Sent Events (SSE).
-
-## How the gateway handles streaming
-
-When the client sends `"stream": true`, the gateway:
-
-1. Sends the request to the upstream provider with streaming enabled.
-2. Sets SSE response headers (`Content-Type: text/event-stream`, `Cache-Control: no-cache`,
-   `Connection: keep-alive`, `X-Accel-Buffering: no`).
-3. Reads chunks from the provider as they arrive.
-4. Writes each chunk as a `data: {json}\n\n` SSE event and flushes immediately.
-5. Sends `data: [DONE]\n\n` when the stream completes.
-
-## Send a streaming request
-
-### curl
-
-Include `"stream": true` in your request to receive incremental token output:
-
-```bash
-curl -X POST http://localhost:8080/v1/chat/completions \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "gpt-4",
-    "messages": [{"role": "user", "content": "Explain quantum entanglement"}],
-    "stream": true
-  }'
-```
-
-### Python
-
-Use the OpenAI Python client with `stream=True`:
-
-```python
-from openai import OpenAI
-
-client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed")
-
-stream = client.chat.completions.create(
-    model="claude-sonnet-4-20250514",
-    messages=[{"role": "user", "content": "Write a haiku"}],
-    stream=True,
-)
-
-for chunk in stream:
-    if chunk.choices[0].delta.content:
-        print(chunk.choices[0].delta.content, end="")
-```
-
-## Response format
-
-The gateway returns a series of SSE events. Each chunk follows the OpenAI format:
-
-```
-data: {"id":"chatcmpl-abc","object":"chat.completion.chunk","choices":[{"delta":{"content":"Quantum"},"index":0}]}
-
-data: {"id":"chatcmpl-abc","object":"chat.completion.chunk","choices":[{"delta":{"content":" entanglement"},"index":0}]}
-
-data: [DONE]
-```
-
-Each `delta` field contains only the incremental content for that chunk. Clients must accumulate chunks
-to reconstruct the full message.
-
-## Response headers
-
-The gateway sets these headers on streaming responses:
-
-| Header | Value | Purpose |
-|---|---|---|
-| `Content-Type` | `text/event-stream` | Identifies the response as SSE |
-| `Cache-Control` | `no-cache` | Prevents caching of the stream |
-| `Connection` | `keep-alive` | Keeps the connection open |
-| `X-Accel-Buffering` | `no` | Disables nginx buffering so chunks reach clients immediately |
-
-## Provider differences
-
-**OpenAI and local (Ollama, vLLM, etc.)** — these already produce OpenAI-format SSE streams. The gateway
-forwards them directly to the client.
-
-**Anthropic** — uses a different event protocol. The gateway translates on the fly:
-
-| Anthropic event | Gateway action |
-|---|---|
-| `message_start` | Captures message ID for subsequent chunks |
-| `content_block_delta` | Emitted as an OpenAI-format `chat.completion.chunk` |
-| `message_delta` | Emitted as a chunk with `finish_reason` (`end_turn` → `stop`) |
-| `message_stop` | Emitted as `data: [DONE]` |
-
-Translation is transparent — clients receive the same format regardless of which provider handled the
-request.
-
-## Error handling
-
-If an error occurs before streaming starts, the gateway returns a standard JSON error response. If an
-error occurs mid-stream (after the SSE connection is established), it's sent as an SSE event containing
-an error JSON object before the connection closes.
diff --git a/unified-doc/docs/mcp-gateway/common-servers.md b/unified-doc/docs/mcp-gateway/common-servers.md
deleted file mode 100644
index 0443304f..00000000
--- a/unified-doc/docs/mcp-gateway/common-servers.md
+++ /dev/null
@@ -1,19 +0,0 @@
----
-title: Common MCP servers
-sidebar_label: Common MCP servers
----
-
-# Common MCP servers
-
-The following MCP servers are available from the official `@modelcontextprotocol` npm scope. Run
-any of these with `npx -y @modelcontextprotocol/server-<name>`, which downloads the package on first use. Any `stdio` MCP server works with
-`mcp-bridge` and `mcp-gateway` regardless of language or runtime.
-
-| Package | Purpose |
-|---------|---------|
-| `@modelcontextprotocol/server-filesystem` | File operations |
-| `@modelcontextprotocol/server-github` | GitHub integration |
-| `@modelcontextprotocol/server-fetch` | Web content fetching |
-| `@modelcontextprotocol/server-memory` | Knowledge graph memory |
-| `@modelcontextprotocol/server-postgres` | PostgreSQL queries |
-| `@modelcontextprotocol/server-sqlite` | SQLite database |
diff --git a/unified-doc/docs/mcp-gateway/configuration.md b/unified-doc/docs/mcp-gateway/configuration.md
deleted file mode 100644
index 17a954a9..00000000
--- a/unified-doc/docs/mcp-gateway/configuration.md
+++ /dev/null
@@ -1,208 +0,0 @@
----
-title: Configuration reference
-sidebar_label: Configuration
----
-
-# Configuration reference
-
-The gateway is configured with a YAML file passed to `mcp-gateway run`. This page covers every
-top-level key and option.
-
-## Top-level structure
-
-A complete config file looks like this:
-
-```yaml
-share_token: "my-gateway"    # optional — see Persistent shares
-
-aggregator:
-  name: "my-gateway"
-  version: "1.0.0"
-  separator: ":"
-  connection:
-    connect_timeout: 30s
-    call_timeout: 60s
-
-backends:
-  - id: my-backend
-    transport:
-      type: stdio
-      command: my-command
-      args: ["arg1"]
-      env:
-        MY_VAR: "${MY_VAR}"
-    tools:
-      mode: allow
-      list:
-        - "tool_name"
-```
-
-## Aggregator settings
-
-The `aggregator` block configures the gateway's identity and connection behavior:
-
-- **`name`**: Gateway name, returned in tool-list responses.
-- **`version`**: Gateway version, returned in tool-list responses.
-- **`separator`**: Character used to namespace tool names (default: `_`). See [Tool namespacing](#tool-namespacing).
-- **`connection.connect_timeout`**: Time to wait when connecting to a backend (default: `30s`).
-- **`connection.call_timeout`**: Time to wait for a tool call to complete (default: `60s`).
-
-## Backends
-
-Each entry in the `backends` list defines one backend MCP server. Every backend requires an `id`
-and a `transport` block.
-
-### Tool namespacing
-
-The backend `id` is used as the namespace prefix for every tool the backend exposes, joined to the
-tool name by the `separator` set in the `aggregator` block. For example, with `separator: ":"`:
-
-| Backend | Original tool | Namespaced tool |
-|---------|---------------|-----------------|
-| docs | `read_file` | `docs:read_file` |
-| docs | `write_file` | `docs:write_file` |
-| data | `read_file` | `data:read_file` |
-
-Common separator choices:
-
-| Separator | Example | Notes |
-|-----------|---------|-------|
-| `_` (default) | `docs_read_file` | Blends in with snake_case names |
-| `:` | `docs:read_file` | Visually distinct |
-| `-` | `docs-read_file` | Can be ambiguous with hyphenated tool names |
-
-### Transport types
-
-- **`stdio`**: Spawns a local process and communicates over stdin/stdout. Use the `env` map to pass
-  environment variables to the process; values support `${VAR}` substitution from the shell
-  environment:
-
-  ```yaml
-  transport:
-    type: stdio
-    command: mcp-filesystem
-    args: ["~/Documents"]
-    env:
-      GITHUB_TOKEN: "${GITHUB_TOKEN}"
-  ```
-
-- **`zrok`**: Connects to a remote bridge over the zrok overlay:
-
-  ```yaml
-  transport:
-    type: zrok
-    share_token: "remote-token"
-  ```
-
-- **`https`**: Connects to a remote MCP server over HTTPS. Only accepts `https://` endpoints.
-  Supports SSE (default) or streamable HTTP transport, with optional custom headers and TLS
-  configuration.
-
-  With custom headers:
-
-  ```yaml
-  transport:
-    type: https
-    endpoint: "https://mcp.example.com/sse"
-    headers:
-      Authorization: "Bearer sk-abc123"
-  ```
-
-  With a custom CA cert and streamable HTTP protocol:
-
-  ```yaml
-  transport:
-    type: https
-    endpoint: "https://mcp.internal.corp/mcp"
-    protocol: "streamable"
-    tls:
-      ca_cert_file: "/etc/ssl/certs/internal-ca.pem"
-  ```
-
-- **`http`**: Connects to a remote MCP server over HTTP or HTTPS. Unlike `https`, accepts both
-  `http://` and `https://` endpoints, but plaintext HTTP requires explicit opt-in:
-
-  ```yaml
-  transport:
-    type: http
-    endpoint: "http://localhost:8080/sse"
-    allow_insecure: true
-  ```
-
-### Tool filtering
-
-By default, every tool from every backend is exposed. Use allow or deny lists to control this
-per backend.
-
-**Allow mode**: Only expose tools that match:
-
-```yaml
-tools:
-  mode: allow
-  list:
-    - "read_file"
-    - "list_directory"
-```
-
-**Deny mode**: Expose everything except tools that match:
-
-```yaml
-tools:
-  mode: deny
-  list:
-    - "write_file"
-```
-
-**Glob patterns**: `*` matches any sequence of characters, `?` matches a single character:
-
-| Pattern | Matches |
-|---------|---------|
-| `read_file` | Exactly `read_file` |
-| `read_*` | `read_file`, `read_dir`, ... |
-| `*file` | `read_file`, `write_file` |
-| `*` | Everything |
-
-Omit the `tools` section entirely to expose all tools.
-
-## Example: Multi-backend configuration
-
-A three-backend setup combining filesystem access, GitHub, and web fetching:
-
-```yaml
-aggregator:
-  name: "my-dev-tools"
-  version: "1.0.0"
-  separator: ":"
-
-backends:
-  - id: filesystem
-    transport:
-      type: stdio
-      command: npx
-      args: ["-y", "@modelcontextprotocol/server-filesystem", "~/Documents"]
-    tools:
-      mode: allow
-      list:
-        - "read_file"
-        - "list_directory"
-        - "search_files"
-
-  - id: github
-    transport:
-      type: stdio
-      command: npx
-      args: ["-y", "@modelcontextprotocol/server-github"]
-      env:
-        GITHUB_TOKEN: "${GITHUB_TOKEN}"
-    tools:
-      mode: deny
-      list:
-        - "delete_*"
-        - "force_*"
-
-  - id: fetch
-    transport:
-      type: stdio
-      command: npx
-      args: ["-y", "@modelcontextprotocol/server-fetch"]
-```
diff --git a/unified-doc/docs/mcp-gateway/get-started.md b/unified-doc/docs/mcp-gateway/get-started.md
deleted file mode 100644
index ed528ab3..00000000
--- a/unified-doc/docs/mcp-gateway/get-started.md
+++ /dev/null
@@ -1,295 +0,0 @@
----
-title: Get started with NetFoundry MCP Gateway
-sidebar_label: Get started
----
-
-# Get started
-
-This guide walks you through NetFoundry MCP Gateway from scratch. You'll start with the simplest possible
-setup — a single MCP server exposed over the network — and build up to a full multi-backend gateway
-with tool filtering and namespacing.
-
-## Prerequisites
-
-Before you begin, you need:
-
-- **Go 1.25.4+**: For building from source.
-- **A zrok v2.0.x account**: Sign up for free at [zrok.io](https://zrok.io) or follow the
-  `zrok2 invite` instructions below.
-
-## Part 1: Enable zrok
-
-NetFoundry MCP Gateway uses [zrok](https://zrok.io) for secure, zero-trust networking. All traffic between
-components travels over an OpenZiti overlay network — nothing is ever exposed on a public IP.
-
-If you already have a zrok v1.x account on zrok.io, the same account token works for enabling a
-v2.x environment; the new environment ends up in `~/.zrok2` and appears in your account overview.
-
-### Request an account
-
-```bash
-zrok2 invite
-```
-
-Enter your email address. You'll receive an invitation email with your account token.
-
-### Install zrok
-
-Download the `zrok2` binary (v2.0.0-rc7 or later) for your platform from the
-[releases page](https://github.com/openziti/zrok/releases/tag/v2.0.0-rc7).
-
-### Enable your environment
-
-```bash
-zrok2 enable <your-token>
-zrok2 status
-```
-
-## Part 2: Your first MCP server (mcp-bridge + mcp-tools)
-
-The simplest setup uses two components:
-
-- **`mcp-bridge`**: Takes a local stdio MCP server and makes it available over the overlay.
-- **`mcp-tools`**: Connects to a remote share and bridges it back to stdio.
-
-Together they let any MCP client talk to an MCP server running anywhere, without opening ports or
-configuring firewalls.
-
-### Install
-
-Install all components with a single command:
-
-```bash
-go install github.com/openziti/mcp-gateway/cmd/...@latest
-```
-
-This installs all components: `mcp-gateway`, `mcp-bridge`, `mcp-tools`, and `mcp-filesystem` (a
-sandboxed filesystem server included for getting started).
-
-### Build from source
-
-Clone the repository and build each binary individually:
-
-```bash
-git clone https://github.com/openziti/mcp-gateway.git
-cd mcp-gateway
-go build ./cmd/mcp-gateway
-go build ./cmd/mcp-bridge
-go build ./cmd/mcp-tools
-```
-
-### Start the bridge
-
-```bash
-mcp-bridge mcp-filesystem ~/Documents
-```
-
-The bridge spawns `mcp-filesystem ~/Documents`, creates a zrok private share, and prints the share
-token:
-
-```json
-{"share_token":"a1b2c3d4e5f6"}
-```
-
-The share token is the only thing needed to connect. There's no IP address, no port, no DNS name —
-the server is a "dark service" that doesn't listen on any network interface. Keep this terminal
-running.
-
-### Connect with mcp-tools
-
-In a second terminal:
-
-```bash
-mcp-tools run a1b2c3d4e5f6
-```
-
-`mcp-tools run` connects to the zrok share and bridges it to stdin/stdout. Any MCP client that speaks
-stdio can use this as its transport.
-
-### Configure Claude Desktop
-
-Add the share to Claude Desktop's config file:
-
-| Platform | Path |
-|----------|------|
-| macOS | `~/Library/Application Support/Claude/claude_desktop_config.json` |
-| Windows | `%APPDATA%\Claude\claude_desktop_config.json` |
-| Linux | `~/.config/Claude/claude_desktop_config.json` |
-
-Add the server entry:
-
-```json
-{
-  "mcpServers": {
-    "filesystem": {
-      "command": "mcp-tools",
-      "args": ["run", "a1b2c3d4e5f6"]
-    }
-  }
-}
-```
-
-Restart Claude Desktop. The `read_file`, `write_file`, and `list_directory` tools will be available.
-
-## Part 3: Aggregate multiple servers (mcp-gateway)
-
-`mcp-gateway` aggregates multiple backends and serves them all through a single zrok share.
-
-### Create a configuration file
-
-Create `gateway-config.yml`:
-
-```yaml
-aggregator:
-  name: "my-gateway"
-  version: "1.0.0"
-  separator: ":"
-
-backends:
-  - id: docs
-    transport:
-      type: stdio
-      command: mcp-filesystem
-      args: ["~/Documents"]
-
-  - id: data
-    transport:
-      type: stdio
-      command: mcp-filesystem
-      args: ["~/Data"]
-    tools:
-      mode: allow
-      list:
-        - "read_file"
-        - "list_directory"
-```
-
-### Start the gateway
-
-Pass the config file path as the argument:
-
-```bash
-mcp-gateway run gateway-config.yml
-```
-
-```text title="Output"
-{"share_token":"x9y8z7w6v5u4"}
-```
-
-Connect the same way:
-
-```bash
-mcp-tools run x9y8z7w6v5u4
-```
-
-The available tools are now namespaced by backend ID:
-
-| Tool | Source |
-|------|--------|
-| `docs:read_file` | docs backend |
-| `docs:write_file` | docs backend |
-| `docs:list_directory` | docs backend |
-| `data:read_file` | data backend (filtered to read-only) |
-| `data:list_directory` | data backend (filtered to read-only) |
-
-`data:write_file` is absent because the allow list on the `data` backend only includes
-`read_file` and `list_directory`. See [Configuration](configuration.md) for the full list of
-aggregator settings, transport types, filtering options, and environment variable syntax.
-
-## Part 4: Connect remote servers
-
-You can connect to MCP servers running on other machines using `mcp-bridge` with the `zrok` transport.
-
-### Run a bridge on a remote machine
-
-```bash
-mcp-bridge mcp-filesystem /data
-```
-
-```text title="Output"
-{"share_token":"remote-token"}
-```
-
-### Add as a gateway backend
-
-Reference the remote bridge's share token under a `zrok` transport:
-
-```yaml
-backends:
-  - id: local
-    transport:
-      type: stdio
-      command: mcp-filesystem
-      args: ["~/Documents"]
-
-  - id: remote
-    transport:
-      type: zrok
-      share_token: "remote-token"
-```
-
-The gateway connects over the zrok overlay — no ports to open, no firewall rules. The remote
-backend's tools are namespaced and filtered like any other backend.
-
-Gateways can chain freely: a gateway backend can point to another gateway's share, or to a bridge
-running anywhere on the network.
-
-## Part 5: Connect to your agent
-
-### Claude Desktop (stdio)
-
-Add the gateway share to Claude Desktop's config:
-
-```json
-{
-  "mcpServers": {
-    "my-tools": {
-      "command": "mcp-tools",
-      "args": ["run", "x9y8z7w6v5u4"]
-    }
-  }
-}
-```
-
-### HTTP mode
-
-For agents or clients that expect an HTTP endpoint:
-
-```bash
-mcp-tools http x9y8z7w6v5u4 --bind 127.0.0.1:8080
-```
-
-Options:
-
-- **`--bind`**: Address to listen on (default: `127.0.0.1:8080`)
-- **`--stateless`**: No session persistence
-- **`--json-response`**: Prefer JSON responses over SSE streams
-
-Any MCP client that supports stdio transport can use `mcp-tools run <token>` directly. For HTTP-based
-clients, use `mcp-tools http`.
-
-**n8n example:** Configure the n8n MCP Client Tool:
-
-- **URL**: `http://127.0.0.1:8080`
-- **Transport**: SSE (default) or streamable HTTP
-
-## Troubleshooting
-
-- **"zrok enable" required**: Run `zrok2 enable` with your account token first.
-
-- **Backend connection failures**: Check that stdio commands are correct and executables are in PATH.
-  For zrok backends, verify the share token is valid and the remote bridge is running.
-
-- **Tool not found**: Check the namespace prefix matches the backend ID. Verify the tool isn't filtered
-  by your allow/deny list.
-
-- **Debug logging**: Set `PFXLOG_LEVEL=debug` for verbose output:
-
-  ```bash
-  PFXLOG_LEVEL=debug mcp-gateway run config.yml
-  ```
-
-## What's next
-
-- [Configuration](configuration.md): Full reference for every aggregator, backend, and filtering option.
-- [Persistent shares](persistent-shares.md): Create share tokens that survive restarts for production deployments.
diff --git a/unified-doc/docs/mcp-gateway/intro.md b/unified-doc/docs/mcp-gateway/intro.md
deleted file mode 100644
index ef4d93df..00000000
--- a/unified-doc/docs/mcp-gateway/intro.md
+++ /dev/null
@@ -1,59 +0,0 @@
----
-title: NetFoundry MCP Gateway overview
-sidebar_label: Overview
-description: >
-  NetFoundry MCP Gateway enables secure, isolated access to Model Context Protocol (MCP) tools across
-  distributed systems without exposing public endpoints.
----
-
-# NetFoundry MCP Gateway overview
-
-NetFoundry MCP Gateway enables secure, isolated access to Model Context Protocol (MCP) tools across
-distributed systems without exposing public endpoints. The project is open source and can be found at:
-[github.com/openziti/mcp-gateway](https://github.com/openziti/mcp-gateway).
-
-## The problem it solves
-
-MCP servers typically run locally via stdio. To access tools on remote machines or share them across
-a team, you'd normally need to expose endpoints — creating security risks. NetFoundry MCP Gateway solves this
-by running everything over OpenZiti's overlay network, so services never listen on public IPs and
-require cryptographic identity to access.
-
-## Components
-
-NetFoundry MCP Gateway consists of three tools that can be used independently or together:
-
-- **`mcp-bridge`**: Takes a local stdio MCP server and exposes it over a zrok private share on the
-  overlay network.
-- **`mcp-gateway`**: Aggregates multiple backends (stdio, HTTP, or other bridges) into a single
-  secure zrok share, with namespacing and tool filtering.
-- **`mcp-tools`**: Connects an MCP client to a remote share, bridging it back to stdio or a local
-  HTTP endpoint.
-
-## How it works
-
-All traffic between components travels over an OpenZiti overlay network via [zrok](https://zrok.io).
-Nothing is ever exposed on a public IP. Only authorized parties with a valid zrok environment can
-connect to a share. This model functions transparently through NATs and firewalls.
-
-The gateway creates an isolated session for each connecting client. Each client gets dedicated
-backend connections — no shared state, no cross-talk between sessions.
-
-## Quick example
-
-1. Share a local MCP server over the overlay:
-
-    ```bash
-    mcp-bridge mcp-filesystem ~/Documents
-    ```
-
-    ```text title="Output"
-    {"share_token":"a1b2c3d4e5f6"}
-    ```
-
-2. Connect to it from anywhere with a zrok-enabled environment:
-
-    ```bash
-    mcp-tools run a1b2c3d4e5f6
-    ```
-
diff --git a/unified-doc/docs/mcp-gateway/persistent-shares.md b/unified-doc/docs/mcp-gateway/persistent-shares.md
deleted file mode 100644
index 7be183b2..00000000
--- a/unified-doc/docs/mcp-gateway/persistent-shares.md
+++ /dev/null
@@ -1,42 +0,0 @@
----
-title: Use persistent shares
-sidebar_label: Persistent shares
----
-
-# Use persistent shares
-
-By default, share tokens are ephemeral — they disappear when the process exits. For production use,
-create persistent shares that survive restarts and keep a stable token.
-
-1. Create a persistent share by running this once to reserve a named token:
-
-    ```bash
-    zrok2 create share my-gateway
-    ```
-
-    Share names must be 3–32 characters, lowercase alphanumeric and hyphens (`[a-z0-9-]`). If you omit
-    the name, zrok generates a random token.
-
-2. Reference the token in your config:
-
-    **mcp-gateway** — set `share_token` at the top level:
-
-    ```yaml
-    share_token: "my-gateway"
-
-    aggregator:
-      name: "my-dev-tools"
-      version: "1.0.0"
-    ```
-
-    **mcp-bridge** — pass `--share-token`:
-
-    ```bash
-    mcp-bridge --share-token my-bridge mcp-filesystem ~/Documents
-    ```
-
-3. When you no longer need the share, delete it:
-
-    ```bash
-    zrok2 delete share my-gateway
-    ```
diff --git a/unified-doc/docusaurus.config.ts b/unified-doc/docusaurus.config.ts
index 9474f0e5..2702e0c7 100644
--- a/unified-doc/docusaurus.config.ts
+++ b/unified-doc/docusaurus.config.ts
@@ -26,6 +26,8 @@ import {onpremRedirects} from "./_remotes/selfhosted/docusaurus/docusaurus-plugi
 import {platformDocsPluginConfig} from "./_remotes/platform/docusaurus/docusaurus-plugin-platform-docs.ts";
 import {openzitiDocsPluginConfig, openzitiRedirects} from "./_remotes/openziti/docusaurus/docusaurus-plugin-openziti-docs.ts";
 import {dataconnectorDocsPluginConfig} from "./_remotes/data-connector/docusaurus/docusaurus-plugin-dataconnector-docs.ts";
+import {llmgatewayDocsPluginConfig} from "./_remotes/llm-gateway/docusaurus/docusaurus-plugin-llmgateway-docs.ts";
+import {mcpgatewayDocsPluginConfig} from "./_remotes/mcp-gateway/docusaurus/docusaurus-plugin-mcpgateway-docs.ts";
 
 // This runs in Node.js - Don't use client-side code here (browser APIs, JSX...)
 const frontdoor = `./_remotes/frontdoor`;
@@ -35,8 +37,8 @@ const zrokRoot = `./_remotes/zrok/website`;
 const zlan = `./_remotes/zlan`;
 const platform = `./_remotes/platform`;
 const dataConnector = `./_remotes/data-connector`;
-const llmGateway = `./docs/llm-gateway`;
-const mcpGateway = `./docs/mcp-gateway`;
+const llmGateway = `./_remotes/llm-gateway`;
+const mcpGateway = `./_remotes/mcp-gateway`;
 
 const isVercel = process.env.IS_VERCEL === 'true';
 const docsBase = isVercel ? '/' : '/docs/';
@@ -353,6 +355,8 @@ const config: Config = {
                                 '@staticdir': path.resolve(__dirname, `docusaurus/static`),
                                 '@platform': path.resolve(__dirname, `${platform}/docusaurus`),
                                 '@dataconnector': path.resolve(__dirname, `${dataConnector}/docusaurus`),
+                                '@llm-gateway': path.resolve(__dirname, `${llmGateway}/docusaurus`),
+                                '@mcp-gateway': path.resolve(__dirname, `${mcpGateway}/docusaurus`),
                             },
                         },
                         module: {
@@ -441,38 +445,16 @@ const config: Config = {
                 routeBase('dataconnector'),
             ),
         ),
-        build(BUILD_FLAGS.LLM_GATEWAY) && [
-            '@docusaurus/plugin-content-docs',
-            {
-                id: 'llm-gateway',
-                path: llmGateway,
-                routeBasePath: routeBase('llm-gateway'),
-                sidebarPath: './sidebars-llm-gateway.ts',
-                beforeDefaultRemarkPlugins: [
-                    remarkGithubAdmonitionsToDirectives,
-                ],
-                remarkPlugins: [
-                    [remarkScopedPath, { mappings: REMARK_MAPPINGS, logLevel: LogLevel.Silent }],
-                    [remarkCodeSections, { logLevel: LogLevel.Silent }],
-                ],
-            },
-        ],
-        build(BUILD_FLAGS.MCP_GATEWAY) && [
-            '@docusaurus/plugin-content-docs',
-            {
-                id: 'mcp-gateway',
-                path: mcpGateway,
-                routeBasePath: routeBase('mcp-gateway'),
-                sidebarPath: './sidebars-mcp-gateway.ts',
-                beforeDefaultRemarkPlugins: [
-                    remarkGithubAdmonitionsToDirectives,
-                ],
-                remarkPlugins: [
-                    [remarkScopedPath, { mappings: REMARK_MAPPINGS, logLevel: LogLevel.Silent }],
-                    [remarkCodeSections, { logLevel: LogLevel.Silent }],
-                ],
-            },
-        ],
+        build(BUILD_FLAGS.LLM_GATEWAY) && llmgatewayDocsPluginConfig(
+            `${llmGateway}/docusaurus`,
+            REMARK_MAPPINGS,
+            routeBase('llm-gateway'),
+        ),
+        build(BUILD_FLAGS.MCP_GATEWAY) && mcpgatewayDocsPluginConfig(
+            `${mcpGateway}/docusaurus`,
+            REMARK_MAPPINGS,
+            routeBase('mcp-gateway'),
+        ),
         ['@docusaurus/plugin-sitemap', { changefreq: "daily", priority: 0.8 }],
         [pluginHotjar, {}],
         [pluginReo, {}],
diff --git a/unified-doc/sidebars-llm-gateway.ts b/unified-doc/sidebars-llm-gateway.ts
deleted file mode 100644
index ad901f71..00000000
--- a/unified-doc/sidebars-llm-gateway.ts
+++ /dev/null
@@ -1,22 +0,0 @@
-import type {SidebarsConfig} from '@docusaurus/plugin-content-docs';
-
-const sidebars: SidebarsConfig = {
-    docsSidebar: [
-        {type: 'html', value: '<span class="menu__link">INTRO</span>', className: 'sidebar-title'},
-        {type: 'doc', id: 'intro'},
-        {type: 'doc', id: 'get-started'},
-        {type: 'html', value: '<span class="menu__link">HOW-TO</span>', className: 'sidebar-title'},
-        {type: 'doc', id: 'connect-zrok'},
-        {type: 'html', value: '<span class="menu__link">LEARN</span>', className: 'sidebar-title'},
-        {type: 'doc', id: 'semantic-routing'},
-        {type: 'doc', id: 'multi-endpoint'},
-        {type: 'html', value: '<span class="menu__link">REFERENCE</span>', className: 'sidebar-title'},
-        {type: 'doc', id: 'configuration'},
-        {type: 'doc', id: 'providers'},
-        {type: 'doc', id: 'api-keys'},
-        {type: 'doc', id: 'streaming'},
-        {type: 'doc', id: 'metrics'},
-    ],
-};
-
-export default sidebars;
diff --git a/unified-doc/sidebars-mcp-gateway.ts b/unified-doc/sidebars-mcp-gateway.ts
deleted file mode 100644
index cb1dc875..00000000
--- a/unified-doc/sidebars-mcp-gateway.ts
+++ /dev/null
@@ -1,16 +0,0 @@
-import type {SidebarsConfig} from '@docusaurus/plugin-content-docs';
-
-const sidebars: SidebarsConfig = {
-    docsSidebar: [
-        {type: 'html', value: '<span class="menu__link">INTRO</span>', className: 'sidebar-title'},
-        {type: 'doc', id: 'intro'},
-        {type: 'doc', id: 'get-started'},
-        {type: 'html', value: '<span class="menu__link">HOW-TO</span>', className: 'sidebar-title'},
-        {type: 'doc', id: 'persistent-shares'},
-        {type: 'html', value: '<span class="menu__link">REFERENCE</span>', className: 'sidebar-title'},
-        {type: 'doc', id: 'configuration'},
-        {type: 'doc', id: 'common-servers'},
-    ],
-};
-
-export default sidebars;