diff --git a/unified-doc/build-docs.sh b/unified-doc/build-docs.sh index 80d5b347..a5cc9fca 100755 --- a/unified-doc/build-docs.sh +++ b/unified-doc/build-docs.sh @@ -17,6 +17,8 @@ # --zlan-branch=BRANCH Branch for netfoundry/zlan (default: main) # --platform-branch=BRANCH Branch for netfoundry/platform-doc (default: main) # --data-connector-branch=BRANCH Branch for netfoundry/nf-data-connector (default: main) +# --llm-gateway-branch=BRANCH Branch for openziti/llm-gateway (default: main) +# --mcp-gateway-branch=BRANCH Branch for openziti/mcp-gateway (default: main) # --clean Wipe _remotes and .docusaurus cache before building # --lint-only Run lint checks only; skip build # --qualifier=VALUE Append VALUE to output dir (e.g. --qualifier=-preview -> build-preview) @@ -34,7 +36,7 @@ # BB_USERNAME Bitbucket username (default: x-token-auth) # DOCUSAURUS_BUILD_MASK Hex bitmask: 0x1=openziti 0x2=frontdoor 0x4=selfhosted # 0x8=zrok 0x10=zlan 0x20=platform -# 0x40=data-connector 0xFF=all (default: 0xFF) +# 0x40=data-connector 0x80=llm-gateway 0x100=mcp-gateway 0x1FF=all (default: 0x1FF) # DOCUSAURUS_PUBLISH_ENV Set to 'prod' to use production Algolia index # NO_MINIFY Set to any value to pass --no-minify to Docusaurus # IS_VERCEL Set to 'true' on Vercel preview deployments @@ -62,6 +64,8 @@ BRANCH_SELFHOSTED="main" BRANCH_ZLAN="main" BRANCH_PLATFORM="main" BRANCH_DATA_CONNECTOR="main" +BRANCH_LLM_GATEWAY="main" +BRANCH_MCP_GATEWAY="main" usage() { sed -n '/^# USAGE/,/^# =====/{ /^# =====/d; s/^# \{0,1\}//; p }' "$0" @@ -76,6 +80,8 @@ while [[ $# -gt 0 ]]; do --zlan-branch=*) BRANCH_ZLAN="${1#*=}"; shift ;; --platform-branch=*) BRANCH_PLATFORM="${1#*=}"; shift ;; --data-connector-branch=*) BRANCH_DATA_CONNECTOR="${1#*=}"; shift ;; + --llm-gateway-branch=*) BRANCH_LLM_GATEWAY="${1#*=}"; shift ;; + --mcp-gateway-branch=*) BRANCH_MCP_GATEWAY="${1#*=}"; shift ;; --ziti-doc-branch) BRANCH_ZITI_DOC="${2:?--ziti-doc-branch requires a value}"; shift 2 ;; --zrok-branch) 
BRANCH_ZROK="${2:?--zrok-branch requires a value}"; shift 2 ;; --frontdoor-branch) BRANCH_FRONTDOOR="${2:?--frontdoor-branch requires a value}"; shift 2 ;; @@ -83,6 +89,8 @@ while [[ $# -gt 0 ]]; do --zlan-branch) BRANCH_ZLAN="${2:?--zlan-branch requires a value}"; shift 2 ;; --platform-branch) BRANCH_PLATFORM="${2:?--platform-branch requires a value}"; shift 2 ;; --data-connector-branch) BRANCH_DATA_CONNECTOR="${2:?--data-connector-branch requires a value}"; shift 2 ;; + --llm-gateway-branch) BRANCH_LLM_GATEWAY="${2:?--llm-gateway-branch requires a value}"; shift 2 ;; + --mcp-gateway-branch) BRANCH_MCP_GATEWAY="${2:?--mcp-gateway-branch requires a value}"; shift 2 ;; --clean) CLEAN=1; shift ;; --lint-only) LINT_ONLY=1; shift ;; -h|--help) usage; exit 0 ;; @@ -109,6 +117,8 @@ echo " BRANCH_SELFHOSTED='$BRANCH_SELFHOSTED'" echo " BRANCH_ZLAN='$BRANCH_ZLAN'" echo " BRANCH_PLATFORM='$BRANCH_PLATFORM'" echo " BRANCH_DATA_CONNECTOR='$BRANCH_DATA_CONNECTOR'" +echo " BRANCH_LLM_GATEWAY='$BRANCH_LLM_GATEWAY'" +echo " BRANCH_MCP_GATEWAY='$BRANCH_MCP_GATEWAY'" echo " CLEAN=$CLEAN" echo " IS_VERCEL='${IS_VERCEL:-}'" echo " node: $(node --version 2>/dev/null || echo 'not found')" @@ -263,6 +273,8 @@ lint_docs() { "${script_dir}/_remotes/openziti/docusaurus/docs" "${script_dir}/_remotes/platform/docusaurus/docs" "${script_dir}/_remotes/data-connector/docusaurus/docs" + "${script_dir}/_remotes/llm-gateway/docusaurus/docs" + "${script_dir}/_remotes/mcp-gateway/docusaurus/docs" ) # 2. 
VERIFY FOLDERS @@ -394,6 +406,8 @@ clone_or_update "https://github.com/netfoundry/zlan.git" clone_or_update "https://github.com/openziti/zrok.git" zrok "$BRANCH_ZROK" clone_or_update "https://bitbucket.org/netfoundry/platform-doc.git" platform "$BRANCH_PLATFORM" clone_or_update "https://bitbucket.org/netfoundry/nf-data-connector.git" data-connector "$BRANCH_DATA_CONNECTOR" +clone_or_update "https://github.com/openziti/llm-gateway.git" llm-gateway "$BRANCH_LLM_GATEWAY" +clone_or_update "https://github.com/openziti/mcp-gateway.git" mcp-gateway "$BRANCH_MCP_GATEWAY" echo "Cleaning stale build artifacts from remotes..." find "$script_dir/_remotes" -type d \( -path "*/docusaurus/build" -o -path "*/docusaurus/.docusaurus" -o -path "*/website/build" -o -path "*/website/.docusaurus" \) -exec rm -rf {} + 2>/dev/null || true diff --git a/unified-doc/docs/llm-gateway/api-keys.md b/unified-doc/docs/llm-gateway/api-keys.md deleted file mode 100644 index 690efa6d..00000000 --- a/unified-doc/docs/llm-gateway/api-keys.md +++ /dev/null @@ -1,114 +0,0 @@ ---- -title: Virtual API keys -sidebar_label: Virtual API keys ---- - -# Virtual API keys - -The gateway supports virtual API keys — gateway-issued bearer tokens that identify clients and optionally -restrict what they can access. These are "virtual" because they're not upstream provider keys; they're -managed entirely by the gateway. - -Clients send the key in the standard `Authorization: Bearer ` header, matching the convention used -with OpenAI and other providers. Existing tools (Open WebUI, LiteLLM clients, curl scripts) work without -changes beyond configuring a key. - -## Configuration - -Keys are defined in the gateway config file. Each key has a name (for logging and attribution), a secret -value, and optional constraints: - -```yaml -api_keys: - enabled: true - keys: - - name: alice - key: "sk-gw-abc123..." 
- allowed_models: ["claude-*", "gpt-*"] - allowed_routes: ["coding", "general"] - - - name: bob - key: "sk-gw-def456..." - allowed_models: ["llama3", "qwen3-vl:*"] - - - name: ci-pipeline - key: "sk-gw-xyz789..." - allowed_models: ["*"] -``` - -When `api_keys` is omitted or `enabled: false`, the gateway operates without authentication — open access. - -Keys support environment variable substitution: - -```yaml -keys: - - name: alice - key: "${ALICE_API_KEY}" -``` - -## Key format - -Keys use the prefix `sk-gw-` to distinguish them from upstream provider keys (OpenAI `sk-`, Anthropic -`sk-ant-`). Generate a key with the CLI: - -```bash -llm-gateway genkey -``` - -```text title="Output" -sk-gw-a1b2c3d4e5f6... -``` - -Keys are stored as plaintext in the config file, consistent with how upstream API keys are stored. - -## Authentication flow - -Every incoming request passes through the auth middleware before reaching any handler: - -``` -Client request - | - v -Auth middleware - |-- /health, /metrics -> pass through (no auth required) - |-- api_keys disabled -> pass through - |-- Authorization header missing -> 401 - |-- Key not recognized -> 401 - |-- Key valid -> attach identity to context, continue - | - v -Existing handler pipeline (unchanged) -``` - -## Model restrictions - -Each key can specify `allowed_models` as a list of glob patterns (e.g., `claude-*` matches any Claude -model). A key with `allowed_models: ["*"]` or no `allowed_models` field has unrestricted access. - -Model permission is checked after the model is fully resolved (including semantic routing) but before -the request is dispatched to a provider. If the resolved model doesn't match any allowed pattern, the -gateway returns 403. - -## Route restrictions - -When semantic routing is enabled, a key can specify `allowed_routes` to limit which semantic routes it -can use. If semantic routing selects a route the key can't access, the gateway returns 403 — it doesn't -silently reroute to a fallback. 
- -## Error responses - -Errors follow the OpenAI-compatible format: - -| Scenario | Status | Error type | -|---|---|---| -| Missing `Authorization` header | 401 | `authentication_error` | -| Invalid key | 401 | `authentication_error` | -| Model not allowed | 403 | `permission_error` | -| Route not allowed | 403 | `permission_error` | - -## Logging and metrics - -The validated key name is included in: - -- **Semantic routing log lines**: `semantic routing: key='alice' method=semantic route='coding' ...` -- **Request metrics**: `key` label on `llm_gateway.requests` and `llm_gateway.request.duration` diff --git a/unified-doc/docs/llm-gateway/configuration.md b/unified-doc/docs/llm-gateway/configuration.md deleted file mode 100644 index 3e2c6d7f..00000000 --- a/unified-doc/docs/llm-gateway/configuration.md +++ /dev/null @@ -1,194 +0,0 @@ ---- -title: Configuration reference -sidebar_label: Configuration ---- - -# Configuration reference - -NetFoundry LLM Gateway is configured with a YAML file. CLI flags can override individual settings. 
- -## Gateway settings - -Controls the listen address for the gateway process: - -```yaml -listen: ":8080" # address to listen on (default: :8080) -``` - -To expose the gateway over a zrok overlay instead of a local port, add a top-level `zrok:` block: - -```yaml -zrok: - share: - enabled: false - mode: private - token: "" -``` - -## Providers - -Configure which inference providers the gateway can route to: - -```yaml -providers: - open_ai: - api_key: ${OPENAI_API_KEY} # supports environment variable expansion - - anthropic: - api_key: ${ANTHROPIC_API_KEY} - - local: - base_url: http://localhost:11434 -``` - -## Virtual API keys - -Restrict client access with named keys and per-key model permissions: - -```yaml -api_keys: - enabled: true - keys: - - name: alice - key: ${ALICE_KEY} - allowed_models: ["gpt-*", "claude-*"] - - name: bob - key: ${BOB_KEY} - allowed_models: ["llama*"] -``` - -See [Virtual API keys](api-keys.md) for a full reference. - -## Routing - -Enable semantic routing and define named routes: - -```yaml -routing: - default_route: general - semantic: - enabled: true - provider: local - model: nomic-embed-text - threshold: 0.75 - ambiguous_threshold: 0.5 - routes: - - name: coding - model: claude-haiku-4-5-20251001 - description: "code generation, debugging, and technical tasks" - examples: - - "write a python function to sort a list" -``` - -See [Semantic routing](semantic-routing.md) for a full reference. - -## Metrics - -Expose a Prometheus metrics endpoint: - -```yaml -metrics: - enabled: true -``` - -## Tracing - -Enable request body logging for debugging routing decisions: - -```yaml -tracing: - enabled: true - max_content_length: 200 # max characters per message in log output -``` - -When enabled, each chat completion request is logged with the model, message count, streaming flag, -tool count, and each message's role and truncated content. - -## Environment variables - -String values support `${VAR_NAME}` expansion. 
Variables are expanded at startup: - -```bash -export OPENAI_API_KEY=sk-... -export ANTHROPIC_API_KEY=sk-ant-... -llm-gateway run config.yaml -``` - -## Complete example - -A full configuration combining all sections: - -```yaml -listen: "0.0.0.0:8080" - -zrok: - share: - enabled: true - token: ${ZROK_SHARE_TOKEN} - -api_keys: - enabled: true - keys: - - name: primary - key: ${PRIMARY_API_KEY} - allowed_models: ["gpt-*", "claude-*", "llama*"] - -providers: - open_ai: - api_key: ${OPENAI_API_KEY} - - anthropic: - api_key: ${ANTHROPIC_API_KEY} - - local: - base_url: http://localhost:11434 - -routing: - default_route: general - semantic: - enabled: true - provider: local - model: nomic-embed-text - threshold: 0.75 - -metrics: - enabled: true -``` - -## Run the gateway - -Pass the config file path as the first argument: - -```bash -llm-gateway run config.yaml -``` - -## CLI flags - -``` -llm-gateway run [flags] - -Flags: - --address string Gateway listen address (e.g., 0.0.0.0:8080) - --zrok Enable zrok share (boolean) - --zrok-mode string Zrok share mode (private or public) - -h, --help Show help -``` - -CLI flags take precedence over the config file. - -## Startup sequence - -When the gateway starts, it: - -1. Loads and parses the YAML config file. -2. Applies any CLI flag overrides. -3. Expands environment variables. -4. Initializes providers (OpenAI, Anthropic, local/self-hosted) in order. -5. Creates the model-to-provider router. -6. Initializes OpenTelemetry metrics (if enabled). -7. Initializes the semantic router (if configured). -8. Starts the HTTP server (local or via zrok share). - -On shutdown (SIGINT/SIGTERM), the gateway closes all providers, deletes ephemeral zrok shares, and -releases zrok access objects before exiting. 
diff --git a/unified-doc/docs/llm-gateway/connect-zrok.md b/unified-doc/docs/llm-gateway/connect-zrok.md deleted file mode 100644 index 19cf848b..00000000 --- a/unified-doc/docs/llm-gateway/connect-zrok.md +++ /dev/null @@ -1,109 +0,0 @@ ---- -title: Connect via zrok -sidebar_label: Connect via zrok ---- - -# Connect via zrok - -The gateway uses [zrok](https://zrok.io) in two independent ways: - -- **Sharing**: Exposes the gateway over a zrok share so clients can reach it without a public IP or - open ports. -- **Accessing**: Connects to backend providers through zrok shares instead of direct HTTP. - -Both use zrok's overlay network built on [OpenZiti](https://openziti.io). - -## Prerequisites - -The gateway requires a zrok environment on the host machine. If `zrok enable` hasn't been run, the -gateway fails at startup: - -``` -zrok environment is not enabled; run 'zrok enable' first -``` - -This applies to both sharing and accessing. - -## Share the gateway - -Instead of listening on a TCP port, the gateway can serve traffic through a zrok share. Clients connect -to the share token rather than an IP address. - -### Ephemeral shares - -An ephemeral share is created at startup and deleted when the gateway shuts down. - -1. Add the zrok config to `config.yaml`: - - ```yaml - zrok: - share: - enabled: true - mode: private # or public - ``` - - Alternatively, pass flags at runtime: - - ```bash - llm-gateway run config.yaml --zrok --zrok-mode private - ``` - -2. Start the gateway. The share token is logged at startup: - - ``` - serving via zrok share 'abc123def456' - ``` - -3. Give clients the share token to connect. - -**Public mode** creates a share accessible by anyone with the token. **Private mode** (the default) -requires the client to have a zrok environment enabled and creates an access-controlled connection -through the overlay. - -### Persistent shares - -Ephemeral shares get a new token on every restart. 
For a stable token, create a persistent share with -`zrok reserve` and pass its token to the gateway: - -```yaml -zrok: - share: - enabled: true - token: "abc123" # existing persistent share token -``` - -Persistent shares are always private. The gateway connects to the existing share but doesn't delete it -on shutdown — the share is managed externally. - -## Access providers via zrok - -Any provider can be reached through a zrok share by setting `zrok_share_token` in its config. This is -useful when a provider runs on a different machine that isn't directly reachable over the network but -is connected to the same zrok environment: - -```yaml -providers: - local: - zrok_share_token: "remote-ollama-token" - - anthropic: - api_key: "${ANTHROPIC_API_KEY}" - zrok_share_token: "anthropic-proxy-token" -``` - -### Multi-endpoint - -Each endpoint can independently use zrok or direct HTTP: - -```yaml -providers: - local: - endpoints: - - name: local - base_url: "http://localhost:11434" - - name: remote-gpu - zrok_share_token: "gpu-box-token" -``` - -Each endpoint with a `zrok_share_token` gets its own zrok access and HTTP client. The round-robin -load balancer uses whichever transport is configured per endpoint. diff --git a/unified-doc/docs/llm-gateway/get-started.md b/unified-doc/docs/llm-gateway/get-started.md deleted file mode 100644 index 41ca409f..00000000 --- a/unified-doc/docs/llm-gateway/get-started.md +++ /dev/null @@ -1,363 +0,0 @@ ---- -title: Get started with NetFoundry LLM Gateway -sidebar_label: Get started ---- - -# Get started - -This guide walks you through installing NetFoundry LLM Gateway and running your first requests. By the -end, you'll have the gateway proxying requests to one or more inference providers. - -## Installation - -Choose the installation method that fits your environment. - -### Pre-built binaries - -Pre-built binaries are available for Linux, macOS, and Windows: - -1. 
Visit the [GitHub Releases](https://github.com/openziti/llm-gateway/releases) page. -2. Download the binary for your platform. -3. Make it executable: - - ```bash - chmod +x llm-gateway - ``` - -4. Run it: - - ```bash - ./llm-gateway run config.yaml - ``` - -### Install with Go - -If you have Go 1.22 or later: - -```bash -go install github.com/openziti/llm-gateway/cmd/llm-gateway@latest -llm-gateway run config.yaml -``` - -### Build from source - -Clone the repository and build the binary locally: - -```bash -git clone https://github.com/openziti/llm-gateway.git -cd llm-gateway -go build -o llm-gateway ./cmd/llm-gateway -./llm-gateway run config.yaml -``` - -## Examples - -The examples below progress from a simple single-provider proxy to a full production configuration. - -### Proxy a local inference server - -1. Start Ollama: - - ```bash - ollama serve - ``` - -2. Create `config.yaml`: - - ```yaml - local: - base_url: http://localhost:11434 - ``` - -3. Start the gateway: - - ```bash - llm-gateway run config.yaml - ``` - -4. Send a request: - - ```bash - curl -X POST http://localhost:8080/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "llama2", - "messages": [{"role": "user", "content": "Hello"}], - "temperature": 0.7 - }' - ``` - -The gateway listens on `http://localhost:8080` by default. 
- -### Route between OpenAI and Anthropic - -The gateway routes requests to the correct provider by prefix-matching on the model name: - -```yaml -providers: - open_ai: - api_key: ${OPENAI_API_KEY} - - anthropic: - api_key: ${ANTHROPIC_API_KEY} - - local: - base_url: http://localhost:11434 -``` - -Requests are routed automatically based on the model prefix: `gpt-*` goes to OpenAI, `claude-*` to -Anthropic, everything else to the local provider: - -```bash -# Routes to OpenAI -curl -X POST http://localhost:8080/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{"model": "gpt-4", "messages": [{"role": "user", "content": "Hello from OpenAI"}]}' - -# Routes to Anthropic -curl -X POST http://localhost:8080/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{"model": "claude-3-sonnet-20240229", "messages": [{"role": "user", "content": "Hello from Anthropic"}]}' - -# Routes to local Ollama -curl -X POST http://localhost:8080/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{"model": "llama2", "messages": [{"role": "user", "content": "Hello from Ollama"}]}' -``` - -### Restrict API access with virtual keys - -1. Generate an API key: - - ```bash - llm-gateway genkey - # sk-gw-a1b2c3d4e5f6... - ``` - -2. Add `api_keys` to your config, referencing the key and setting per-key model permissions: - - ```yaml - api_keys: - enabled: true - keys: - - name: primary - key: ${PRIMARY_API_KEY} - allowed_models: ["gpt-*", "claude-*"] - - name: local-only - key: ${LOCAL_API_KEY} - allowed_models: ["llama*"] - - providers: - open_ai: - api_key: ${OPENAI_API_KEY} - - anthropic: - api_key: ${ANTHROPIC_API_KEY} - - local: - base_url: http://localhost:11434 - ``` - -3. Clients send their key in the `Authorization` header: - - ```bash - curl -X POST http://localhost:8080/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer sk-gw-a1b2c3d4e5f6..." 
\ - -d '{"model": "gpt-4", "messages": [{"role": "user", "content": "Hello"}]}' - ``` - -### Use the Python OpenAI client - -The gateway works as a drop-in replacement for the OpenAI Python client. Point `base_url` at the -gateway and it handles provider routing transparently: - -```python -from openai import OpenAI - -client = OpenAI( - base_url="http://localhost:8080/v1", - api_key="not-needed" # gateway handles auth -) - -# Routes to OpenAI -response = client.chat.completions.create( - model="gpt-4o", - messages=[{"role": "user", "content": "Hello!"}] -) - -# Routes to Anthropic (translated automatically) -response = client.chat.completions.create( - model="claude-sonnet-4-20250514", - messages=[{"role": "user", "content": "Hello!"}] -) - -# Routes to local backend (Ollama, vLLM, etc.) -response = client.chat.completions.create( - model="llama3.2", - messages=[{"role": "user", "content": "Hello!"}] -) -``` - -### Semantic routing - -Route requests automatically based on content analysis, without requiring clients to specify a model: - -```yaml -routing: - default_route: general - semantic: - enabled: true - provider: local - model: nomic-embed-text - threshold: 0.75 - ambiguous_threshold: 0.5 - routes: - - name: coding - model: claude-haiku-4-5-20251001 - description: "code generation, debugging, and technical tasks" - examples: - - "write a python function to sort a list" - - "debug this segfault in my C code" - - name: general - model: qwen3-vl:30b - description: "general knowledge and conversation" - examples: - - "what is the capital of France" - - "explain how photosynthesis works" - -providers: - local: - base_url: http://localhost:11434 -``` - -See [Semantic routing](semantic-routing.md) for a full explanation of how routing works. 
- -### Multi-endpoint load balancing - -Distribute requests across multiple inference backends: - -```yaml -providers: - local: - endpoints: - - name: ollama-primary - base_url: http://localhost:11434 - weight: 2 - - name: ollama-secondary - base_url: http://localhost:11435 - weight: 1 - - name: vllm-endpoint - base_url: http://vllm.example.com:8000 - weight: 1 - - open_ai: - api_key: ${OPENAI_API_KEY} - - anthropic: - api_key: ${ANTHROPIC_API_KEY} -``` - -See [Multi-endpoint load balancing](multi-endpoint.md) for health check and failover options. - -### Connect via zrok - -Share the gateway over a zrok overlay so clients can reach it without a public IP: - -```yaml -zrok: - share: - enabled: true - mode: private - -providers: - local: - base_url: http://localhost:11434 -``` - -Or access a remote inference backend through a zrok share: - -```yaml -providers: - local: - endpoints: - - name: remote-ollama - zrok_share_token: ${ZROK_OLLAMA_TOKEN} -``` - -See [Connect via zrok](connect-zrok.md) for setup details. 
- -### Production configuration - -A full configuration combining multiple providers, API key authentication, semantic routing, load -balancing, metrics, and zrok: - -```yaml -listen: "0.0.0.0:8080" - -zrok: - share: - enabled: true - token: ${ZROK_SHARE_TOKEN} - -api_keys: - enabled: true - keys: - - name: primary - key: ${PRIMARY_API_KEY} - allowed_models: ["gpt-*", "claude-*", "llama*"] - - name: local-only - key: ${LOCAL_API_KEY} - allowed_models: ["llama*"] - -providers: - open_ai: - api_key: ${OPENAI_API_KEY} - - anthropic: - api_key: ${ANTHROPIC_API_KEY} - - local: - endpoints: - - name: ollama-primary - base_url: http://localhost:11434 - weight: 3 - - name: ollama-secondary - base_url: http://localhost:11435 - weight: 1 - - name: vllm-endpoint - zrok_share_token: ${ZROK_VLLM_TOKEN} - weight: 2 - -routing: - default_route: general - semantic: - enabled: true - provider: local - model: nomic-embed-text - threshold: 0.75 - ambiguous_threshold: 0.5 - routes: - - name: coding - model: claude-haiku-4-5-20251001 - description: "code generation, debugging, and technical tasks" - examples: - - "write a python function to sort a list" - - name: general - model: qwen3-vl:30b - description: "general knowledge and conversation" - examples: - - "what is the capital of France" - -metrics: - enabled: true -``` - -## More info - -- [Configuration](configuration.md): All configuration options -- [Providers](providers.md): How provider routing and format translation work -- [Multi-endpoint load balancing](multi-endpoint.md): Advanced load balancing strategies -- [Virtual API keys](api-keys.md): Client authentication and model-level restrictions -- [Semantic routing](semantic-routing.md): Intelligent request routing based on content -- [Metrics](metrics.md): Prometheus metrics and observability diff --git a/unified-doc/docs/llm-gateway/intro.md b/unified-doc/docs/llm-gateway/intro.md deleted file mode 100644 index be10c234..00000000 --- a/unified-doc/docs/llm-gateway/intro.md 
+++ /dev/null @@ -1,48 +0,0 @@ ---- -title: NetFoundry LLM Gateway overview -sidebar_label: Overview -description: > - NetFoundry LLM Gateway is an OpenAI-compatible API proxy that routes requests across multiple LLM - providers using zero-trust networking over OpenZiti. ---- - -# NetFoundry LLM Gateway overview - -NetFoundry LLM Gateway is an OpenAI-compatible API proxy that routes requests across multiple LLM -providers using zero-trust networking. The project is open source and can be found at: -[github.com/openziti/llm-gateway](https://github.com/openziti/llm-gateway). - -## What it does - -It handles provider routing, format translation, and zero-trust networking so clients interact with a -single OpenAI-compatible endpoint regardless of which model or provider handles the request. - -- **Multi-provider routing**: Routes requests to OpenAI, Anthropic, and any OpenAI-compatible backend - (Ollama, vLLM, llama-server, SGLang, etc.) by prefix-matching on the model name. -- **Zero-trust networking**: Uses zrok over OpenZiti overlay networks to connect to backends across NAT - and air-gapped environments — no firewall configuration needed. -- **Semantic routing**: A three-layer cascade (keyword heuristics → embedding similarity → LLM classifier) - automatically selects the right model when the client omits a model name. -- **Load balancing**: Weighted round-robin across multiple inference servers with health checks and passive - failover. -- **Single binary**: One Go binary, one YAML config file — no database, message queue, or sidecar. - -## API endpoints - -The gateway exposes standard OpenAI-compatible endpoints: - -| Endpoint | Description | -|---|---| -| `POST /v1/chat/completions` | Chat completions (streaming and non-streaming) | -| `GET /v1/models` | List available models from all providers | -| `GET /health` | Health check | -| `GET /metrics` | Prometheus metrics (when enabled) | - -Streaming works via Server-Sent Events across all providers. 
Anthropic requests are automatically -translated to and from OpenAI format, so existing tools that speak OpenAI work without changes. - -## Observability - -Prometheus metrics track request volume, latency, token usage, routing decisions, and endpoint health. -Per-request body logging is available for debugging routing behavior. - diff --git a/unified-doc/docs/llm-gateway/metrics.md b/unified-doc/docs/llm-gateway/metrics.md deleted file mode 100644 index 8f1cfb95..00000000 --- a/unified-doc/docs/llm-gateway/metrics.md +++ /dev/null @@ -1,163 +0,0 @@ ---- -title: Metrics -sidebar_label: Metrics ---- - -# Metrics - -The gateway exposes OpenTelemetry metrics via a Prometheus exporter. When enabled, metrics are available -at `GET /metrics` in the standard Prometheus text format. - -## Enabling metrics - -Add the following to your config file: - -```yaml -metrics: - enabled: true -``` - -## Instruments - -All metric names are prefixed with `llm_gateway.`. - -### Request metrics - -These metrics track individual requests through the gateway: - -**`llm_gateway.requests`** (counter): Total chat completion requests. - -| Attribute | Values | Description | -|---|---|---| -| `provider` | `openai`, `anthropic`, `ollama` | Which provider handled the request | -| `model` | Model name | The model used | -| `streaming` | `true`, `false` | Whether the request was streaming | -| `key` | Key name or empty | The API key name (when [virtual API keys](api-keys.md) are enabled) | - ---- - -**`llm_gateway.request.duration`** (histogram, seconds): End-to-end request duration including -upstream provider latency. 
- -| Attribute | Values | Description | -|---|---|---| -| `provider` | `openai`, `anthropic`, `ollama` | Which provider handled the request | -| `model` | Model name | The model used | -| `key` | Key name or empty | The API key name (when [virtual API keys](api-keys.md) are enabled) | - ---- - -**`llm_gateway.requests.inflight`** (up-down counter): Number of requests currently being processed. -Incremented when a request enters the handler, decremented when it completes. Useful for understanding -concurrency and detecting request pileups. No attributes. - -### Token metrics - -These metrics track token consumption as reported by each provider: - -**`llm_gateway.tokens.prompt`** (counter): Total prompt (input) tokens across all requests. - -| Attribute | Values | Description | -|---|---|---| -| `provider` | Provider name | Which provider reported the usage | -| `model` | Model name | The model used | - ---- - -**`llm_gateway.tokens.completion`** (counter): Total completion (output) tokens across all requests. - -| Attribute | Values | Description | -|---|---|---| -| `provider` | Provider name | Which provider reported the usage | -| `model` | Model name | The model used | - -Token metrics are recorded from the `usage` field in non-streaming responses. Streaming responses -typically don't include token counts. - -### Routing metrics - -This metric tracks how routing decisions are distributed across the cascade layers: - -**`llm_gateway.routing.decisions`** (counter): Semantic routing decisions, counted each time the -router selects a model. - -| Attribute | Values | Description | -|---|---|---| -| `method` | `explicit`, `heuristic`, `semantic`, `classifier`, `default` | Which routing layer made the decision | - -A high proportion of `default` decisions may indicate that thresholds are too strict or that route -examples don't cover your traffic well. 
- -### Error metrics - -This metric tracks errors returned by upstream providers, broken down by error category: - -**`llm_gateway.provider.errors`** (counter): Errors returned by upstream providers. - -| Attribute | Values | Description | -|---|---|---| -| `error_type` | `invalid_request_error`, `authentication_error`, `rate_limit_error`, `server_error`, `not_found_error`, `service_unavailable`, `unknown` | The error category | - -### Health metrics - -This metric tracks endpoint availability in multi-endpoint mode: - -**`llm_gateway.endpoint.healthy`** (up-down counter): Per-endpoint health status. Value is `1` for -healthy endpoints and `0` for unhealthy endpoints. - -| Attribute | Values | Description | -|---|---|---| -| `endpoint` | Endpoint name | The endpoint being reported on | - -## Prometheus scraping - -Point your Prometheus instance at the gateway's `/metrics` endpoint: - -```yaml -# prometheus.yml -scrape_configs: - - job_name: llm-gateway - scrape_interval: 15s - static_configs: - - targets: ["localhost:8080"] -``` - -## Useful queries - -Some example PromQL queries to get started: - -- Requests per minute by provider: - - ```promql - rate(llm_gateway_requests_total[5m]) * 60 - ``` - -- Average request duration by model: - - ```promql - rate(llm_gateway_request_duration_seconds_sum[5m]) / rate(llm_gateway_request_duration_seconds_count[5m]) - ``` - -- Token throughput (tokens per second): - - ```promql - rate(llm_gateway_tokens_prompt_total[5m]) + rate(llm_gateway_tokens_completion_total[5m]) - ``` - -- Error rate as a percentage of total requests: - - ```promql - rate(llm_gateway_provider_errors_total[5m]) / rate(llm_gateway_requests_total[5m]) * 100 - ``` - -- Routing method distribution: - - ```promql - rate(llm_gateway_routing_decisions_total[5m]) - ``` - -- Current in-flight requests: - - ```promql - llm_gateway_requests_inflight - ``` diff --git a/unified-doc/docs/llm-gateway/multi-endpoint.md b/unified-doc/docs/llm-gateway/multi-endpoint.md 
deleted file mode 100644 index f7ad3b5a..00000000 --- a/unified-doc/docs/llm-gateway/multi-endpoint.md +++ /dev/null @@ -1,131 +0,0 @@ ---- -title: Multi-endpoint load balancing -sidebar_label: Multi-endpoint load balancing ---- - -# Multi-endpoint load balancing - -When you have multiple inference backends, configure the gateway to distribute requests across them -with automatic health checking and failover. - -## Supported backends - -The gateway works with any OpenAI-compatible backend: - -- Ollama -- vLLM -- llama.cpp -- SGLang -- Any server implementing `POST /v1/chat/completions` - -## Configuration - -Instead of a single `base_url`, define an `endpoints` list: - -```yaml -providers: - local: - endpoints: - - name: ollama-primary - base_url: http://localhost:11434 - weight: 2 - - name: ollama-secondary - base_url: http://localhost:11435 - weight: 1 - - name: vllm-endpoint - base_url: http://vllm.example.com:8000 - weight: 1 -``` - -Each endpoint has: - -- **`name`**: A descriptive name for logging and monitoring. -- **`base_url`**: Direct HTTP access to the backend, or use `zrok_share_token` for overlay network access. -- **`weight`**: Controls traffic distribution proportion. Optional, default `1`. - -The `local` key is the section name — it doesn't restrict which backends you can use. Endpoints can be -any OpenAI-compatible server: Ollama, vLLM, llama.cpp, SGLang, or any custom server. - -To access a remote backend via zrok: - -```yaml -providers: - local: - endpoints: - - name: remote-ollama - zrok_share_token: ${ZROK_OLLAMA_TOKEN} - weight: 1 -``` - -## Load balancing - -The gateway uses **weighted round-robin** load balancing. An endpoint with `weight: 3` receives roughly -3× the requests of an endpoint with `weight: 1`. - -`GET /v1/models` returns the deduplicated union of models from all healthy endpoints. 
- -## Health checking - -A background process periodically checks endpoint health: - -```yaml -providers: - local: - health_check: - interval_seconds: 30 # check every 30 seconds (default) - timeout_seconds: 5 # per-endpoint timeout (default) -``` - -The health check probes `/v1/models` (standard OpenAI format) or falls back to `/api/tags` (Ollama). - -When an endpoint fails a check, the gateway logs `endpoint 'name' is now unhealthy` and stops sending it -traffic. When it recovers, it logs `endpoint 'name' is now healthy` and resumes normal traffic. Health -checks continue at an exponential backoff schedule — 1× interval after the first failure, up to 10× -after many failures. - -If the system detects a long gap since the last health check (for example, after a VM sleep/wake cycle), -endpoint checks are staggered to avoid flooding the network with simultaneous reconnection attempts. - -## Failover - -When a request fails due to a network problem (connection refused, timeout, etc.), the gateway retries -on the next healthy endpoint. Application-level errors (HTTP 400, 404, etc.) don't trigger failover — -they indicate a problem with the request, not the endpoint. - -## Semantic routing integration - -When semantic routing uses the local provider in multi-endpoint mode, embedding and classifier requests -automatically benefit from the same load distribution and failover via a shared HTTP client. -No additional configuration is needed. 
- -## Full example - -Three local endpoints with weighted distribution, health checking, and a zrok-connected backup alongside cloud providers: - -```yaml -providers: - local: - endpoints: - - name: ollama-primary - base_url: http://localhost:11434 - weight: 3 - - name: ollama-secondary - base_url: http://localhost:11435 - weight: 1 - - name: vllm-prod - base_url: http://vllm-prod.example.com:8000 - weight: 2 - - name: vllm-backup - zrok_share_token: ${ZROK_VLLM_BACKUP_TOKEN} - weight: 1 - - health_check: - interval_seconds: 30 - timeout_seconds: 5 - - open_ai: - api_key: ${OPENAI_API_KEY} - - anthropic: - api_key: ${ANTHROPIC_API_KEY} -``` diff --git a/unified-doc/docs/llm-gateway/providers.md b/unified-doc/docs/llm-gateway/providers.md deleted file mode 100644 index 6cbea189..00000000 --- a/unified-doc/docs/llm-gateway/providers.md +++ /dev/null @@ -1,157 +0,0 @@ ---- -title: Providers -sidebar_label: Providers ---- - -# Providers - -The gateway presents a single OpenAI-compatible API to clients and translates requests to the -appropriate backend provider. Three provider types are supported: OpenAI (and compatible APIs), -Anthropic, and a local/self-hosted provider for any backend that implements `/v1/chat/completions`. - -## API surface - -All clients interact with the gateway using the [OpenAI chat completions format](https://developers.openai.com/api/reference/chat-completions/overview): - -``` -POST /v1/chat/completions chat completions (streaming and non-streaming) -GET /v1/models list available models from all providers -GET /health health check -GET /metrics Prometheus metrics (when enabled) -``` - -By default, the gateway doesn't require a client API key — authentication is between the gateway and the -upstream providers. Optionally, the gateway can enforce its own [virtual API keys](api-keys.md). 
- -## Model routing - -Models are routed to providers by prefix-matching on the model name: - -| Prefix | Provider | -|---|---| -| `gpt-*`, `o1-*`, `o3-*` | OpenAI | -| `claude-*` | Anthropic | -| Everything else | Local (configured as `local`) | - -Matching is case-insensitive. A request for `gpt-4` goes to OpenAI; `claude-haiku-4-5-20251001` goes to -Anthropic; `llama3` or `qwen3-vl:30b` go to the local provider. - -If the target provider isn't configured, the gateway returns an error: - -```json -{"error": {"message": "provider 'openai' is not configured", "type": "invalid_request_error"}} -``` - -## OpenAI provider - -The OpenAI provider is a direct pass-through. Requests forward to `POST {base_url}/v1/chat/completions` -with an `Authorization: Bearer` header. Responses are returned unmodified. - -Any OpenAI-compatible API can be used as the OpenAI provider by setting `base_url` — for example, -Azure OpenAI or a local vLLM server. - -Model listing calls `GET {base_url}/v1/models`. - -## Anthropic provider - -The Anthropic provider translates between the OpenAI format and -[Anthropic's Messages API](https://docs.anthropic.com/en/docs/api-reference/messages/create). Clients -send OpenAI-format requests and receive OpenAI-format responses regardless of which provider handles -the request. 
- -### Request translation - -The gateway maps OpenAI request fields to their Anthropic equivalents before forwarding: - -| OpenAI field | Anthropic field | Notes | -|---|---|---| -| `model` | `model` | Passed through | -| `messages` (role: system) | `system` | First system message becomes Anthropic's top-level `system` field | -| `messages` (role: user) | `messages` (role: user) | | -| `messages` (role: assistant) | `messages` (role: assistant) | | -| `messages` (role: tool) | `messages` (role: user) | Mapped to user role | -| `max_tokens` | `max_tokens` | Defaults to 4096 if not set (Anthropic requires this field) | -| `temperature` | `temperature` | | -| `top_p` | `top_p` | | -| `stop` | `stop_sequences` | String or array | - -### Response translation - -The gateway maps Anthropic response fields back to the OpenAI format before returning to the client: - -| Anthropic field | OpenAI field | Notes | -|---|---|---| -| `id` | `id` | | -| `content[].text` | `choices[0].message.content` | Text blocks are concatenated | -| `usage.input_tokens` | `usage.prompt_tokens` | | -| `usage.output_tokens` | `usage.completion_tokens` | | -| `stop_reason` | `choices[0].finish_reason` | `end_turn`/`stop_sequence` → `stop`; `max_tokens` → `length` | - -### Streaming translation - -Anthropic uses a different streaming event format than OpenAI. The gateway translates on the fly: - -| Anthropic event | Action | -|---|---| -| `message_start` | Captures the message ID for subsequent chunks | -| `content_block_delta` | Emitted as an OpenAI-format `chat.completion.chunk` with the delta text | -| `message_delta` | Emitted as a chunk with the `finish_reason` | -| `message_stop` | Emitted as the `[DONE]` sentinel | - -### Model listing - -Anthropic doesn't have a public models listing endpoint. The provider returns a static list of current -and legacy Claude models. 
- -### Error translation - -Anthropic error types are mapped to their gateway equivalents: - -| Anthropic error type | Gateway error type | HTTP status | -|---|---|---| -| `authentication_error` | `authentication_error` | 401 | -| `rate_limit_error` | `rate_limit_error` | 429 | -| `invalid_request_error` | `invalid_request_error` | 400 | -| `not_found_error` | `not_found_error` | 404 | -| (other) | `server_error` | 500 | - -## Local / self-hosted provider - -The local provider is a direct pass-through to any OpenAI-compatible backend. Chat completions go to -`POST {base_url}/v1/chat/completions`. This means Ollama, vLLM, llama.cpp, SGLang, or any server -exposing this endpoint can be used. - -Model listing tries `GET {base_url}/v1/models` first, falling back to Ollama's native -`GET {base_url}/api/tags`. - -For multi-endpoint load balancing and failover, see [Multi-endpoint load balancing](multi-endpoint.md). - -## Streaming - -All three providers support streaming via Server-Sent Events (SSE). See [Streaming](streaming.md) for -response format, headers, and how the gateway processes streaming requests. 
- -## Error handling - -All providers translate upstream errors into a consistent OpenAI-compatible format: - -```json -{ - "error": { - "message": "description of what went wrong", - "type": "error_type", - "param": null, - "code": null - } -} -``` - -| Error type | HTTP status | Typical cause | -|---|---|---| -| `invalid_request_error` | 400 | Malformed request, missing model, provider not configured | -| `authentication_error` | 401 | Invalid API key | -| `permission_error` | 403 | Insufficient permissions | -| `not_found_error` | 404 | Model not found | -| `rate_limit_error` | 429 | Upstream rate limit hit | -| `server_error` | 500 | Provider returned an unexpected error | -| `service_unavailable` | 503 | Provider is down | diff --git a/unified-doc/docs/llm-gateway/semantic-routing.md b/unified-doc/docs/llm-gateway/semantic-routing.md deleted file mode 100644 index d7c368d7..00000000 --- a/unified-doc/docs/llm-gateway/semantic-routing.md +++ /dev/null @@ -1,347 +0,0 @@ ---- -title: Semantic routing -sidebar_label: Semantic routing ---- - -# Semantic routing - -When a request arrives without a `model` field (or with `model: auto`), the gateway uses semantic -routing to decide which backend model should handle it. Routing uses a three-layer cascade: fast -heuristic rules are tried first, then embedding-based similarity, then an LLM classifier. Each layer -can either make a confident decision or pass to the next. If no layer produces a match, the request -falls back to a configured default route. - -## The routing cascade - -The router evaluates layers in order and stops at the first confident result: - -``` -Request arrives - | - v -1. Explicit model? ──yes──> use that model (bypass routing) - │no - v -2. Heuristics match? ──yes──> use matched route - │no - v -3. Embeddings match? - ├─ confident (>= threshold) ──> use matched route - ├─ ambiguous (>= ambiguous_threshold but < threshold) ──> escalate to classifier - └─ no match - v -4. Classifier match? 
──yes──> use classified route - │no - v -5. Default route -``` - -Each step appends to a **cascade log** visible in the gateway's output: - -``` -semantic routing: method=semantic route='coding' model='claude-haiku-4-5-20251001' - confidence=0.87 latency=12ms cascade=[heuristic:no_match,semantic:coding:0.87] -``` - -### Explicit model passthrough - -If the client sends a `model` field and `allow_explicit_model` is `true` (the default), the router uses -that model directly without evaluating any layers: - -```yaml -routing: - allow_explicit_model: true # default; set false to force all requests through routing -``` - -### The `auto` virtual model - -Clients that always require a `model` field (such as Open WebUI) can send `model: auto`. The gateway -clears this to an empty string before routing, which triggers the full cascade. When semantic routing -is enabled, `auto` appears in the `/v1/models` endpoint so clients can discover it. - -## Routes - -A route maps a name to a backend model and provides context for the embedding and classifier layers: - -```yaml -routes: - - name: coding - model: claude-haiku-4-5-20251001 - description: "code generation, debugging, code review, and technical programming tasks" - examples: - - "write a python function to sort a list" - - "debug this segfault in my C code" - - "review this pull request for bugs" - - "implement a binary search tree in Go" -``` - -Each field serves a specific role across the routing layers: - -| Field | Used by | Purpose | -|---|---|---| -| `name` | All layers | Identifier for heuristic rules, cascade logs, and classifier output | -| `model` | All layers | The backend model to use when this route is selected | -| `description` | Classifier | Included in the classifier prompt | -| `examples` | Embeddings | Converted to vectors at startup for similarity matching | - -## Layer 1: Heuristics - -Heuristics are fast, deterministic rules evaluated before any model calls: - -```yaml -heuristics: - enabled: true - 
rules: - - match: - keywords: ["translate", "translation"] - route: general - - match: - has_tools: true - route: tools - - match: - system_prompt_contains: "you are a code assistant" - route: coding - - match: - max_tokens_lt: 100 - message_length_lt: 200 - route: fast -``` - -Rules are evaluated in order. The first matching rule wins. All conditions within a rule must be true -(AND logic). - -### Match conditions - -These conditions can appear in a match block: - -- **`keywords`**: Matched against user messages with word boundaries, case-insensitive. Any single - keyword matching is sufficient. -- **`exclude`**: Phrases that suppress a keyword match. If any exclusion phrase is found, the rule - doesn't match. Useful for filtering out boilerplate text injected by clients like Open WebUI. -- **`system_prompt_contains`**: A substring matched against any system message, case-insensitive. -- **`max_tokens_lt`**: Matches if `max_tokens` is set and strictly less than the given value. -- **`message_length_lt`**: Matches if the total character count across all messages is strictly less - than the given value. -- **`has_tools`**: Matches if the request includes tool definitions (`true`) or does not (`false`). - -### Exclusions - -When using broad keywords, you may encounter false positives from boilerplate text injected by clients: - -```yaml -- match: - keywords: ["code", "debug", "refactor"] - exclude: ["code fences", "code block", "### Task"] - route: coding -``` - -Exclusions are checked first. If any exclusion phrase matches, the rule is skipped entirely. - -## Layer 2: Embeddings - -The embedding layer converts text into numerical vectors and uses cosine similarity to find the closest -route. - -At startup, each route's example prompts are embedded and stored in memory. When a request arrives, the -last user message is embedded and compared against each route's stored vectors. Messages longer than -2048 characters are truncated before embedding. 
- -### Configuration - -Set the following options under `semantic:` in your routing config: - -```yaml -semantic: - enabled: true - provider: local # local or openai - model: nomic-embed-text # embedding model name - threshold: 0.75 # minimum similarity for a confident match - ambiguous_threshold: 0.5 # below threshold but above this → escalate to classifier - comparison: centroid # centroid, max, or average - cache_embeddings: true # cache prompt embeddings to avoid repeated calls - cache_ttl: 3600 # cache entry lifetime in seconds (default: 3600) - cache_size: 1000 # maximum cache entries (default: 1000) -``` - -### Comparison modes - -Three modes control how the embedding layer compares a request against stored route examples: - -- **`centroid`** (default): Averages all example embeddings into a single vector per route. Fastest. - Works well when examples cluster around a common theme. -- **`max`**: Compares against every example individually and uses the highest score. Good when a route - covers several distinct sub-topics. More prone to false positives. -- **`average`**: Compares against every example individually and uses the mean score. Balanced between - `centroid` and `max`. - -| Situation | Recommended mode | -|---|---| -| Examples per route are similar to each other | `centroid` | -| A route covers several distinct sub-topics | `max` | -| You want balanced "generally like this route" scoring | `average` | - -### Thresholds - -``` -score >= threshold → confident match, return immediately -ambiguous_threshold <= score < threshold → ambiguous, escalate to classifier -score < ambiguous_threshold → no match, continue to next layer -``` - -The right values depend on your embedding model and route structure. Models like `nomic-embed-text` -tend to produce higher similarity scores, so you may need higher thresholds (0.7–0.85 for `threshold`, -0.4–0.6 for `ambiguous_threshold`). 
- -### Embedding cache - -When `cache_embeddings` is true, prompt embeddings are cached in an LRU cache keyed by a SHA-256 hash -of the prompt text. `cache_size` controls capacity (evicts least recently used when full). - -## Layer 3: LLM classifier - -The classifier sends the user's prompt to a chat model and asks it to classify the request into one of -the configured routes. It's typically used as a fallback for ambiguous embedding results but can also -run standalone. - -### Configuration - -Set the following options under `classifier:` in your routing config: - -```yaml -classifier: - enabled: true - provider: local # local or openai - model: qwen3-vl:30b - timeout_ms: 10000 # request timeout in milliseconds (0 = no timeout) - confidence_threshold: 0.7 # minimum confidence to accept the classification - cache_results: true - cache_ttl: 3600 - cache_size: 500 -``` - -### When the classifier runs - -The classifier is invoked when: - -- The embedding layer found a route but the score was between `ambiguous_threshold` and `threshold`. -- Embeddings are disabled and heuristics found no match. - -The classifier's result is accepted only if the confidence meets or exceeds `confidence_threshold`. - -### Route descriptions matter - -The classifier relies on the `description` field to understand what each route represents. Write -descriptions that are specific enough for an LLM to distinguish between routes — vague descriptions -produce poor classifications. - -The classifier's response may be wrapped in markdown code blocks. The gateway strips those -automatically before parsing the result. - -## Default route - -If no layer produces a confident result, the gateway uses: - -```yaml -routing: - default_route: general -``` - -If `default_route` isn't set, the first route in the `routes` list is the absolute fallback. 
- -## Example configuration - -A minimal setup using only the embedding layer, with two routes: - -```yaml -routing: - default_route: general - - semantic: - enabled: true - provider: local - model: nomic-embed-text - threshold: 0.75 - ambiguous_threshold: 0.5 - - routes: - - name: coding - model: claude-haiku-4-5-20251001 - description: "code generation, debugging, and technical tasks" - examples: - - "write a python function to sort a list" - - "debug this segfault in my C code" - - - name: general - model: qwen3-vl:30b - description: "general knowledge and conversation" - examples: - - "what is the capital of France" - - "explain how photosynthesis works" -``` - -## Full configuration reference - -All routing options in a single block with defaults shown: - -```yaml -routing: - allow_explicit_model: true - default_route: general - - heuristics: - enabled: true - rules: - - match: - keywords: [...] - exclude: [...] - system_prompt_contains: "..." - max_tokens_lt: 100 - message_length_lt: 200 - has_tools: true - route: route_name - - semantic: - enabled: true - provider: local - model: nomic-embed-text - threshold: 0.75 - ambiguous_threshold: 0.5 - comparison: centroid - cache_embeddings: false # default: false - cache_ttl: 3600 - cache_size: 1000 - - classifier: - enabled: true - provider: local - model: qwen3-vl:30b - timeout_ms: 0 # default: 0 (no timeout) - confidence_threshold: 0 # default: 0 - cache_results: false # default: false - cache_ttl: 3600 - cache_size: 500 - - routes: - - name: coding - model: claude-haiku-4-5-20251001 - description: "code generation, debugging, and technical tasks" - examples: - - "write a python function to sort a list" - - "debug this segfault in my C code" -``` - -## Tuning tips - -A few principles for getting good routing results: - -- **Start simple.** Enable only the embedding layer with a few well-chosen examples per route. Add - heuristics and the classifier later if needed. 
-- **Add more examples before switching comparison modes.** Four well-chosen examples often solve - problems that changing `comparison` won't. -- **Keep examples realistic.** Use prompts that look like what users actually send. -- **Use heuristics for obvious cases.** If every request containing "translate" should go to the same - route, a keyword heuristic is faster and more reliable than embedding similarity. -- **Watch the cascade logs.** The gateway logs the full cascade for every routed request. This is the - best way to understand why a request was routed where it was. -- **Use metrics for aggregate tuning.** A high proportion of `default` decisions suggests your - thresholds are too strict or your examples don't cover your traffic well. diff --git a/unified-doc/docs/llm-gateway/streaming.md b/unified-doc/docs/llm-gateway/streaming.md deleted file mode 100644 index 519cd81a..00000000 --- a/unified-doc/docs/llm-gateway/streaming.md +++ /dev/null @@ -1,104 +0,0 @@ ---- -title: Streaming -sidebar_label: Streaming ---- - -# Streaming - -All providers support streaming chat completions via Server-Sent Events (SSE). - -## How the gateway handles streaming - -When the client sends `"stream": true`, the gateway: - -1. Sends the request to the upstream provider with streaming enabled. -2. Sets SSE response headers (`Content-Type: text/event-stream`, `Cache-Control: no-cache`, - `X-Accel-Buffering: no`). -3. Reads chunks from the provider as they arrive. -4. Writes each chunk as a `data: {json}\n\n` SSE event and flushes immediately. -5. Sends `data: [DONE]\n\n` when the stream completes. 
- -## Send a streaming request - -### curl - -Include `"stream": true` in your request to receive incremental token output: - -```bash -curl -X POST http://localhost:8080/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{ - "model": "gpt-4", - "messages": [{"role": "user", "content": "Explain quantum entanglement"}], - "stream": true - }' -``` - -### Python - -Use the OpenAI Python client with `stream=True`: - -```python -from openai import OpenAI - -client = OpenAI(base_url="http://localhost:8080/v1", api_key="not-needed") - -stream = client.chat.completions.create( - model="claude-sonnet-4-20250514", - messages=[{"role": "user", "content": "Write a haiku"}], - stream=True, -) - -for chunk in stream: - if chunk.choices[0].delta.content: - print(chunk.choices[0].delta.content, end="") -``` - -## Response format - -The gateway returns a series of SSE events. Each chunk follows the OpenAI format: - -``` -data: {"id":"chatcmpl-abc","object":"chat.completion.chunk","choices":[{"delta":{"content":"Quantum"},"index":0}]} - -data: {"id":"chatcmpl-abc","object":"chat.completion.chunk","choices":[{"delta":{"content":" entanglement"},"index":0}]} - -data: [DONE] -``` - -Each `delta` field contains only the incremental content for that chunk. Clients must accumulate chunks -to reconstruct the full message. - -## Response headers - -The gateway sets these headers on streaming responses: - -| Header | Value | Purpose | -|---|---|---| -| `Content-Type` | `text/event-stream` | identifies the response as SSE | -| `Cache-Control` | `no-cache` | prevents caching of the stream | -| `Connection` | `keep-alive` | keeps the connection open | -| `X-Accel-Buffering` | `no` | disables nginx buffering so chunks reach clients immediately | - -## Provider differences - -**OpenAI and local (Ollama, vLLM, etc.)** — these already produce OpenAI-format SSE streams. The gateway -forwards them directly to the client. - -**Anthropic** — uses a different event protocol. 
The gateway translates on the fly: - -| Anthropic event | Gateway action | -|---|---| -| `message_start` | captures message ID for subsequent chunks | -| `content_block_delta` | emitted as an OpenAI-format `chat.completion.chunk` | -| `message_delta` | emitted as a chunk with `finish_reason` (`end_turn` → `stop`) | -| `message_stop` | emitted as `data: [DONE]` | - -Translation is transparent — clients receive the same format regardless of which provider handled the -request. - -## Error handling - -If an error occurs before streaming starts, the gateway returns a standard JSON error response. If an -error occurs mid-stream (after the SSE connection is established), it's sent as an SSE event containing -an error JSON object before the connection closes. diff --git a/unified-doc/docs/mcp-gateway/common-servers.md b/unified-doc/docs/mcp-gateway/common-servers.md deleted file mode 100644 index 0443304f..00000000 --- a/unified-doc/docs/mcp-gateway/common-servers.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -title: Common MCP servers -sidebar_label: Common MCP servers ---- - -# Common MCP servers - -The following MCP servers are available from the official `@modelcontextprotocol` npm scope. Install -any of these with `npx -y @modelcontextprotocol/server-<name>`. Any `stdio` MCP server works with -`mcp-bridge` and `mcp-gateway` regardless of language or runtime. 
- -| Package | Purpose | -|---------|---------| -| `@modelcontextprotocol/server-filesystem` | File operations | -| `@modelcontextprotocol/server-github` | GitHub integration | -| `@modelcontextprotocol/server-fetch` | Web content fetching | -| `@modelcontextprotocol/server-memory` | Knowledge graph memory | -| `@modelcontextprotocol/server-postgres` | PostgreSQL queries | -| `@modelcontextprotocol/server-sqlite` | SQLite database | diff --git a/unified-doc/docs/mcp-gateway/configuration.md b/unified-doc/docs/mcp-gateway/configuration.md deleted file mode 100644 index 17a954a9..00000000 --- a/unified-doc/docs/mcp-gateway/configuration.md +++ /dev/null @@ -1,208 +0,0 @@ ---- -title: Configuration reference -sidebar_label: Configuration ---- - -# Configuration reference - -The gateway is configured with a YAML file passed to `mcp-gateway run`. This page covers every -top-level key and option. - -## Top-level structure - -A complete config file looks like this: - -```yaml -share_token: "my-gateway" # optional — see Persistent shares - -aggregator: - name: "my-gateway" - version: "1.0.0" - separator: ":" - connection: - connect_timeout: 30s - call_timeout: 60s - -backends: - - id: my-backend - transport: - type: stdio - command: my-command - args: ["arg1"] - env: - MY_VAR: "${MY_VAR}" - tools: - mode: allow - list: - - "tool_name" -``` - -## Aggregator settings - -The `aggregator` block configures the gateway's identity and connection behavior: - -- **`name`**: Gateway name, returned in tool-list responses. -- **`version`**: Gateway version, returned in tool-list responses. -- **`separator`**: Character used to namespace tool names (default: `_`). See [Tool namespacing](#tool-namespacing). -- **`connection.connect_timeout`**: Time to wait when connecting to a backend (default: `30s`). -- **`connection.call_timeout`**: Time to wait for a tool call to complete (default: `60s`). - -## Backends - -Each entry in the `backends` list defines one backend MCP server. 
Every backend requires an `id` -and a `transport` block. - -### Tool namespacing - -The backend `id` is used as the namespace prefix for every tool the backend exposes, combined with -the `separator` set in the `aggregator` block: - -| Backend | Original tool | Namespaced tool | -|---------|---------------|-----------------| -| docs | `read_file` | `docs:read_file` | -| docs | `write_file` | `docs:write_file` | -| data | `read_file` | `data:read_file` | - -Common separator choices: - -| Separator | Example | Notes | -|-----------|---------|-------| -| `_` (default) | `docs_read_file` | Blends in with snake_case names | -| `:` | `docs:read_file` | Visually distinct | -| `-` | `docs-read_file` | Can be ambiguous with hyphenated tool names | - -### Transport types - -- **`stdio`**: Spawns a local process and communicates over stdin/stdout. Use the `env` map to pass - environment variables to the process; values support `${VAR}` substitution from the shell - environment: - - ```yaml - transport: - type: stdio - command: mcp-filesystem - args: ["~/Documents"] - env: - GITHUB_TOKEN: "${GITHUB_TOKEN}" - ``` - -- **`zrok`**: Connects to a remote bridge over the zrok overlay: - - ```yaml - transport: - type: zrok - share_token: "remote-token" - ``` - -- **`https`**: Connects to a remote MCP server over HTTPS. Only accepts `https://` endpoints. - Supports SSE (default) or streamable HTTP transport, with optional custom headers and TLS - configuration. - - With custom headers: - - ```yaml - transport: - type: https - endpoint: "https://mcp.example.com/sse" - headers: - Authorization: "Bearer sk-abc123" - ``` - - With a custom CA cert and streamable HTTP protocol: - - ```yaml - transport: - type: https - endpoint: "https://mcp.internal.corp/mcp" - protocol: "streamable" - tls: - ca_cert_file: "/etc/ssl/certs/internal-ca.pem" - ``` - -- **`http`**: Connects to a remote MCP server over HTTP or HTTPS. 
Unlike `https`, accepts both - `http://` and `https://` endpoints, but plaintext HTTP requires explicit opt-in: - - ```yaml - transport: - type: http - endpoint: "http://localhost:8080/sse" - allow_insecure: true - ``` - -### Tool filtering - -By default, every tool from every backend is exposed. Use allow or deny lists to control this -per backend. - -**Allow mode**: Only expose tools that match: - -```yaml -tools: - mode: allow - list: - - "read_file" - - "list_directory" -``` - -**Deny mode**: Expose everything except tools that match: - -```yaml -tools: - mode: deny - list: - - "write_file" -``` - -**Glob patterns**: `*` matches any sequence of characters, `?` matches a single character: - -| Pattern | Matches | -|---------|---------| -| `read_file` | Exactly `read_file` | -| `read_*` | `read_file`, `read_dir`, ... | -| `*file` | `read_file`, `write_file` | -| `*` | Everything | - -Omit the `tools` section entirely to expose all tools. - -## Example: Multi-backend configuration - -A three-backend setup combining filesystem access, GitHub, and web fetching: - -```yaml -aggregator: - name: "my-dev-tools" - version: "1.0.0" - separator: ":" - -backends: - - id: filesystem - transport: - type: stdio - command: npx - args: ["-y", "@modelcontextprotocol/server-filesystem", "~/Documents"] - tools: - mode: allow - list: - - "read_file" - - "list_directory" - - "search_files" - - - id: github - transport: - type: stdio - command: npx - args: ["-y", "@modelcontextprotocol/server-github"] - env: - GITHUB_TOKEN: "${GITHUB_TOKEN}" - tools: - mode: deny - list: - - "delete_*" - - "force_*" - - - id: fetch - transport: - type: stdio - command: npx - args: ["-y", "@modelcontextprotocol/server-fetch"] -``` diff --git a/unified-doc/docs/mcp-gateway/get-started.md b/unified-doc/docs/mcp-gateway/get-started.md deleted file mode 100644 index ed528ab3..00000000 --- a/unified-doc/docs/mcp-gateway/get-started.md +++ /dev/null @@ -1,295 +0,0 @@ ---- -title: Get started with NetFoundry 
MCP Gateway -sidebar_label: Get started ---- - -# Get started - -This guide walks you through NetFoundry MCP Gateway from scratch. You'll start with the simplest possible -setup — a single MCP server exposed over the network — and build up to a full multi-backend gateway -with tool filtering and namespacing. - -## Prerequisites - -Before you begin, you need: - -- **Go 1.25.4+**: For building from source. -- **A zrok v2.0.x account**: Sign up for free at [zrok.io](https://zrok.io) or follow the - `zrok2 invite` instructions below. - -## Part 1: Enable zrok - -NetFoundry MCP Gateway uses [zrok](https://zrok.io) for secure, zero-trust networking. All traffic between -components travels over an OpenZiti overlay network — nothing is ever exposed on a public IP. - -If you already have a zrok v1.x account on zrok.io, the same account token works for enabling a -v2.x environment; the new environment ends up in `~/.zrok2` and appears in your account overview. - -### Request an account - -```bash -zrok2 invite -``` - -Enter your email address. You'll receive an invitation email with your account token. - -### Install zrok - -Download the `zrok2` binary (v2.0.0-rc7 or later) for your platform from the -[releases page](https://github.com/openziti/zrok/releases/tag/v2.0.0-rc7). - -### Enable your environment - -```bash -zrok2 enable <account_token> -zrok2 status -``` - -## Part 2: Your first MCP server (mcp-bridge + mcp-tools) - -The simplest setup uses two components: - -- **`mcp-bridge`**: Takes a local stdio MCP server and makes it available over the overlay. -- **`mcp-tools`**: Connects to a remote share and bridges it back to stdio. - -Together they let any MCP client talk to an MCP server running anywhere, without opening ports or -configuring firewalls. 
- -### Install - -Install all components with a single command: - -```bash -go install github.com/openziti/mcp-gateway/cmd/...@latest -``` - -This installs all components: `mcp-gateway`, `mcp-bridge`, `mcp-tools`, and `mcp-filesystem` (a -sandboxed filesystem server included for getting started). - -### Build from source - -Clone the repository and build each binary individually: - -```bash -git clone https://github.com/openziti/mcp-gateway.git -cd mcp-gateway -go build ./cmd/mcp-gateway -go build ./cmd/mcp-bridge -go build ./cmd/mcp-tools -``` - -### Start the bridge - -```bash -mcp-bridge mcp-filesystem ~/Documents -``` - -The bridge spawns `mcp-filesystem ~/Documents`, creates a zrok private share, and prints the share -token: - -```json -{"share_token":"a1b2c3d4e5f6"} -``` - -The share token is the only thing needed to connect. There's no IP address, no port, no DNS name — -the server is a "dark service" that doesn't listen on any network interface. Keep this terminal -running. - -### Connect with mcp-tools - -In a second terminal: - -```bash -mcp-tools run a1b2c3d4e5f6 -``` - -`mcp-tools run` connects to the zrok share and bridges it to stdin/stdout. Any MCP client that speaks -stdio can use this as its transport. - -### Configure Claude Desktop - -Add the share to Claude Desktop's config file: - -| Platform | Path | -|----------|------| -| macOS | `~/Library/Application Support/Claude/claude_desktop_config.json` | -| Windows | `%APPDATA%\Claude\claude_desktop_config.json` | -| Linux | `~/.config/Claude/claude_desktop_config.json` | - -Add the server entry: - -```json -{ - "mcpServers": { - "filesystem": { - "command": "mcp-tools", - "args": ["run", "a1b2c3d4e5f6"] - } - } -} -``` - -Restart Claude Desktop. The `read_file`, `write_file`, and `list_directory` tools will be available. - -## Part 3: Aggregate multiple servers (mcp-gateway) - -`mcp-gateway` aggregates multiple backends and serves them all through a single zrok share. 
- -### Create a configuration file - -Create `gateway-config.yml`: - -```yaml -aggregator: - name: "my-gateway" - version: "1.0.0" - separator: ":" - -backends: - - id: docs - transport: - type: stdio - command: mcp-filesystem - args: ["~/Documents"] - - - id: data - transport: - type: stdio - command: mcp-filesystem - args: ["~/Data"] - tools: - mode: allow - list: - - "read_file" - - "list_directory" -``` - -### Start the gateway - -Pass the config file path as the argument: - -```bash -mcp-gateway run gateway-config.yml -``` - -```text title="Output" -{"share_token":"x9y8z7w6v5u4"} -``` - -Connect the same way: - -```bash -mcp-tools run x9y8z7w6v5u4 -``` - -The available tools are now namespaced by backend ID: - -| Tool | Source | -|------|--------| -| `docs:read_file` | docs backend | -| `docs:write_file` | docs backend | -| `docs:list_directory` | docs backend | -| `data:read_file` | data backend (filtered to read-only) | -| `data:list_directory` | data backend (filtered to read-only) | - -`data:write_file` is absent because the allow list on the `data` backend only includes -`read_file` and `list_directory`. See [Configuration](configuration.md) for the full list of -aggregator settings, transport types, filtering options, and environment variable syntax. - -## Part 4: Connect remote servers - -You can connect to MCP servers running on other machines using `mcp-bridge` with the `zrok` transport. - -### Run a bridge on a remote machine - -```bash -mcp-bridge mcp-filesystem /data -``` - -```text title="Output" -{"share_token":"remote-token"} -``` - -### Add as a gateway backend - -Reference the remote bridge's share token under a `zrok` transport: - -```yaml -backends: - - id: local - transport: - type: stdio - command: mcp-filesystem - args: ["~/Documents"] - - - id: remote - transport: - type: zrok - share_token: "remote-token" -``` - -The gateway connects over the zrok overlay — no ports to open, no firewall rules. 
The remote -backend's tools are namespaced and filtered like any other backend. - -Gateways can chain freely: a gateway backend can point to another gateway's share, or to a bridge -running anywhere on the network. - -## Part 5: Connect to your agent - -### Claude Desktop (stdio) - -Add the gateway share to Claude Desktop's config: - -```json -{ - "mcpServers": { - "my-tools": { - "command": "mcp-tools", - "args": ["run", "x9y8z7w6v5u4"] - } - } -} -``` - -### HTTP mode - -For agents or clients that expect an HTTP endpoint: - -```bash -mcp-tools http x9y8z7w6v5u4 --bind 127.0.0.1:8080 -``` - -Options: - -- **`--bind`**: Address to listen on (default: `127.0.0.1:8080`) -- **`--stateless`**: No session persistence -- **`--json-response`**: Prefer JSON responses over SSE streams - -Any MCP client that supports stdio transport can use `mcp-tools run <share-token>` directly. For HTTP-based -clients, use `mcp-tools http`. - -**n8n example:** Configure the n8n MCP Client Tool: - -- **URL**: `http://127.0.0.1:8080` -- **Transport**: SSE (default) or streamable HTTP - -## Troubleshooting - -- **"zrok enable" required**: Run `zrok2 enable` with your account token first. - -- **Backend connection failures**: Check that stdio commands are correct and executables are in PATH. - For zrok backends, verify the share token is valid and the remote bridge is running. - -- **Tool not found**: Check the namespace prefix matches the backend ID. Verify the tool isn't filtered - by your allow/deny list. - -- **Debug logging**: Set `PFXLOG_LEVEL=debug` for verbose output: - - ```bash - PFXLOG_LEVEL=debug mcp-gateway run config.yml - ``` - -## What's next - -- [Configuration](configuration.md): Full reference for every aggregator, backend, and filtering option. -- [Persistent shares](persistent-shares.md): Create share tokens that survive restarts for production deployments.
diff --git a/unified-doc/docs/mcp-gateway/intro.md b/unified-doc/docs/mcp-gateway/intro.md deleted file mode 100644 index ef4d93df..00000000 --- a/unified-doc/docs/mcp-gateway/intro.md +++ /dev/null @@ -1,59 +0,0 @@ ---- -title: NetFoundry MCP Gateway overview -sidebar_label: Overview -description: > - NetFoundry MCP Gateway enables secure, isolated access to Model Context Protocol (MCP) tools across - distributed systems without exposing public endpoints. ---- - -# NetFoundry MCP Gateway overview - -NetFoundry MCP Gateway enables secure, isolated access to Model Context Protocol (MCP) tools across -distributed systems without exposing public endpoints. The project is open source and can be found at: -[github.com/openziti/mcp-gateway](https://github.com/openziti/mcp-gateway). - -## The problem it solves - -MCP servers typically run locally via stdio. To access tools on remote machines or share them across -a team, you'd normally need to expose endpoints — creating security risks. NetFoundry MCP Gateway solves this -by running everything over OpenZiti's overlay network, so services never listen on public IPs and -require cryptographic identity to access. - -## Components - -NetFoundry MCP Gateway consists of three tools that can be used independently or together: - -- **`mcp-bridge`**: Takes a local stdio MCP server and exposes it over a zrok private share on the - overlay network. -- **`mcp-gateway`**: Aggregates multiple backends (stdio, HTTP, or other bridges) into a single - secure zrok share, with namespacing and tool filtering. -- **`mcp-tools`**: Connects an MCP client to a remote share, bridging it back to stdio or a local - HTTP endpoint. - -## How it works - -All traffic between components travels over an OpenZiti overlay network via [zrok](https://zrok.io). -Nothing is ever exposed on a public IP. Only authorized parties with a valid zrok environment can -connect to a share. This model functions transparently through NATs and firewalls. 
- -The gateway creates an isolated session for each connecting client. Each client gets dedicated -backend connections — no shared state, no cross-talk between sessions. - -## Quick example - -1. Share a local MCP server over the overlay: - - ```bash - mcp-bridge mcp-filesystem ~/Documents - ``` - - ```text title="Output" - {"share_token":"a1b2c3d4e5f6"} - ``` - -2. Connect to it from anywhere with a zrok-enabled environment: - - ```bash - mcp-tools run a1b2c3d4e5f6 - ``` - diff --git a/unified-doc/docs/mcp-gateway/persistent-shares.md b/unified-doc/docs/mcp-gateway/persistent-shares.md deleted file mode 100644 index 7be183b2..00000000 --- a/unified-doc/docs/mcp-gateway/persistent-shares.md +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: Use persistent shares -sidebar_label: Persistent shares ---- - -# Use persistent shares - -By default, share tokens are ephemeral — they disappear when the process exits. For production use, -create persistent shares that survive restarts and keep a stable token. - -1. Create a persistent share by running this once to reserve a named token: - - ```bash - zrok2 create share my-gateway - ``` - - Share names must be 3–32 characters, lowercase alphanumeric and hyphens (`[a-z0-9-]`). If you omit - the name, zrok generates a random token. - -2. Reference the token in your config: - - **mcp-gateway** — set `share_token` at the top level: - - ```yaml - share_token: "my-gateway" - - aggregator: - name: "my-dev-tools" - version: "1.0.0" - ``` - - **mcp-bridge** — pass `--share-token`: - - ```bash - mcp-bridge --share-token my-bridge mcp-filesystem ~/Documents - ``` - -3. 
When you no longer need the share, delete it: - - ```bash - zrok2 delete share my-gateway - ``` diff --git a/unified-doc/docusaurus.config.ts b/unified-doc/docusaurus.config.ts index 9474f0e5..2702e0c7 100644 --- a/unified-doc/docusaurus.config.ts +++ b/unified-doc/docusaurus.config.ts @@ -26,6 +26,8 @@ import {onpremRedirects} from "./_remotes/selfhosted/docusaurus/docusaurus-plugi import {platformDocsPluginConfig} from "./_remotes/platform/docusaurus/docusaurus-plugin-platform-docs.ts"; import {openzitiDocsPluginConfig, openzitiRedirects} from "./_remotes/openziti/docusaurus/docusaurus-plugin-openziti-docs.ts"; import {dataconnectorDocsPluginConfig} from "./_remotes/data-connector/docusaurus/docusaurus-plugin-dataconnector-docs.ts"; +import {llmgatewayDocsPluginConfig} from "./_remotes/llm-gateway/docusaurus/docusaurus-plugin-llmgateway-docs.ts"; +import {mcpgatewayDocsPluginConfig} from "./_remotes/mcp-gateway/docusaurus/docusaurus-plugin-mcpgateway-docs.ts"; // This runs in Node.js - Don't use client-side code here (browser APIs, JSX...) const frontdoor = `./_remotes/frontdoor`; @@ -35,8 +37,8 @@ const zrokRoot = `./_remotes/zrok/website`; const zlan = `./_remotes/zlan`; const platform = `./_remotes/platform`; const dataConnector = `./_remotes/data-connector`; -const llmGateway = `./docs/llm-gateway`; -const mcpGateway = `./docs/mcp-gateway`; +const llmGateway = `./_remotes/llm-gateway`; +const mcpGateway = `./_remotes/mcp-gateway`; const isVercel = process.env.IS_VERCEL === 'true'; const docsBase = isVercel ? 
'/' : '/docs/'; @@ -353,6 +355,8 @@ const config: Config = { '@staticdir': path.resolve(__dirname, `docusaurus/static`), '@platform': path.resolve(__dirname, `${platform}/docusaurus`), '@dataconnector': path.resolve(__dirname, `${dataConnector}/docusaurus`), + '@llm-gateway': path.resolve(__dirname, `${llmGateway}/docusaurus`), + '@mcp-gateway': path.resolve(__dirname, `${mcpGateway}/docusaurus`), }, }, module: { @@ -441,38 +445,16 @@ const config: Config = { routeBase('dataconnector'), ), ), - build(BUILD_FLAGS.LLM_GATEWAY) && [ - '@docusaurus/plugin-content-docs', - { - id: 'llm-gateway', - path: llmGateway, - routeBasePath: routeBase('llm-gateway'), - sidebarPath: './sidebars-llm-gateway.ts', - beforeDefaultRemarkPlugins: [ - remarkGithubAdmonitionsToDirectives, - ], - remarkPlugins: [ - [remarkScopedPath, { mappings: REMARK_MAPPINGS, logLevel: LogLevel.Silent }], - [remarkCodeSections, { logLevel: LogLevel.Silent }], - ], - }, - ], - build(BUILD_FLAGS.MCP_GATEWAY) && [ - '@docusaurus/plugin-content-docs', - { - id: 'mcp-gateway', - path: mcpGateway, - routeBasePath: routeBase('mcp-gateway'), - sidebarPath: './sidebars-mcp-gateway.ts', - beforeDefaultRemarkPlugins: [ - remarkGithubAdmonitionsToDirectives, - ], - remarkPlugins: [ - [remarkScopedPath, { mappings: REMARK_MAPPINGS, logLevel: LogLevel.Silent }], - [remarkCodeSections, { logLevel: LogLevel.Silent }], - ], - }, - ], + build(BUILD_FLAGS.LLM_GATEWAY) && llmgatewayDocsPluginConfig( + `${llmGateway}/docusaurus`, + REMARK_MAPPINGS, + routeBase('llm-gateway'), + ), + build(BUILD_FLAGS.MCP_GATEWAY) && mcpgatewayDocsPluginConfig( + `${mcpGateway}/docusaurus`, + REMARK_MAPPINGS, + routeBase('mcp-gateway'), + ), ['@docusaurus/plugin-sitemap', { changefreq: "daily", priority: 0.8 }], [pluginHotjar, {}], [pluginReo, {}], diff --git a/unified-doc/sidebars-llm-gateway.ts b/unified-doc/sidebars-llm-gateway.ts deleted file mode 100644 index ad901f71..00000000 --- a/unified-doc/sidebars-llm-gateway.ts +++ /dev/null @@ 
-1,22 +0,0 @@ -import type {SidebarsConfig} from '@docusaurus/plugin-content-docs'; - -const sidebars: SidebarsConfig = { - docsSidebar: [ - {type: 'html', value: 'INTRO', className: 'sidebar-title'}, - {type: 'doc', id: 'intro'}, - {type: 'doc', id: 'get-started'}, - {type: 'html', value: 'HOW-TO', className: 'sidebar-title'}, - {type: 'doc', id: 'connect-zrok'}, - {type: 'html', value: 'LEARN', className: 'sidebar-title'}, - {type: 'doc', id: 'semantic-routing'}, - {type: 'doc', id: 'multi-endpoint'}, - {type: 'html', value: 'REFERENCE', className: 'sidebar-title'}, - {type: 'doc', id: 'configuration'}, - {type: 'doc', id: 'providers'}, - {type: 'doc', id: 'api-keys'}, - {type: 'doc', id: 'streaming'}, - {type: 'doc', id: 'metrics'}, - ], -}; - -export default sidebars; diff --git a/unified-doc/sidebars-mcp-gateway.ts b/unified-doc/sidebars-mcp-gateway.ts deleted file mode 100644 index cb1dc875..00000000 --- a/unified-doc/sidebars-mcp-gateway.ts +++ /dev/null @@ -1,16 +0,0 @@ -import type {SidebarsConfig} from '@docusaurus/plugin-content-docs'; - -const sidebars: SidebarsConfig = { - docsSidebar: [ - {type: 'html', value: 'INTRO', className: 'sidebar-title'}, - {type: 'doc', id: 'intro'}, - {type: 'doc', id: 'get-started'}, - {type: 'html', value: 'HOW-TO', className: 'sidebar-title'}, - {type: 'doc', id: 'persistent-shares'}, - {type: 'html', value: 'REFERENCE', className: 'sidebar-title'}, - {type: 'doc', id: 'configuration'}, - {type: 'doc', id: 'common-servers'}, - ], -}; - -export default sidebars;