Add Anthropic prompt caching (direct + via OpenRouter)

Caches the system prompt/tools and the growing conversation history via cache_control breakpoints, cutting cost and latency on repeated turns. Covers both the regular chat path and the tool-calling loop (chatWithToolMessages), which has its own request-building code and was initially missed. Cost calculation now accounts for cache write/read pricing instead of treating all input tokens as full price. Verified live: cache reads grow turn-over-turn in oAI.log.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-18 12:43:32 +02:00
parent a793fdacc4
commit 5b99a6f81c
5 changed files with 131 additions and 21 deletions
@@ -198,6 +198,11 @@ class OpenRouterProvider: AIProvider {
        }
        if let maxTokens = maxTokens { body["max_tokens"] = maxTokens }
        if let temperature = temperature { body["temperature"] = temperature }
+        // Anthropic models require an explicit cache_control opt-in on OpenRouter;
+        // other providers cache automatically.
+        if model.hasPrefix("anthropic/") {
+            body["cache_control"] = ["type": "ephemeral"]
+        }

        var urlRequest = URLRequest(url: url)
        urlRequest.httpMethod = "POST"
@@ -388,6 +393,12 @@ class OpenRouterProvider: AIProvider {
            ReasoningAPIConfig(effort: $0.effort, exclude: $0.exclude ? true : nil)
        }

+        // Anthropic models require an explicit cache_control opt-in on OpenRouter;
+        // other providers (OpenAI, DeepSeek, Gemini, Grok, etc.) cache automatically.
+        let cacheControl: OpenRouterChatRequest.CacheControl? = effectiveModel.hasPrefix("anthropic/")
+            ? .init(type: "ephemeral")
+            : nil
+
        return OpenRouterChatRequest(
            model: effectiveModel,
            messages: apiMessages,
@@ -398,7 +409,8 @@ class OpenRouterProvider: AIProvider {
            tools: request.tools,
            toolChoice: request.tools != nil ? "auto" : nil,
            modalities: request.imageGeneration ? ["text", "image"] : nil,
-            reasoning: reasoningConfig
+            reasoning: reasoningConfig,
+            cacheControl: cacheControl
        )
    }
    
@@ -416,6 +428,11 @@ class OpenRouterProvider: AIProvider {
        let allImages = topLevelImages + blockImages
        let images: [Data]? = allImages.isEmpty ? nil : allImages

+        if let details = apiResponse.usage?.promptTokensDetails,
+           details.cachedTokens != nil || details.cacheWriteTokens != nil {
+            Log.api.info("OpenRouter cache usage: model=\(apiResponse.model), created=\(details.cacheWriteTokens ?? 0), read=\(details.cachedTokens ?? 0)")
+        }
+
        return ChatResponse(
            id: apiResponse.id,
            model: apiResponse.model,
@@ -426,7 +443,9 @@ class OpenRouterProvider: AIProvider {
                ChatResponse.Usage(
                    promptTokens: usage.promptTokens,
                    completionTokens: usage.completionTokens,
-                    totalTokens: usage.totalTokens
+                    totalTokens: usage.totalTokens,
+                    cacheCreationInputTokens: usage.promptTokensDetails?.cacheWriteTokens,
+                    cacheReadInputTokens: usage.promptTokensDetails?.cachedTokens
                )
            },
            created: Date(timeIntervalSince1970: TimeInterval(apiResponse.created)),
@@ -446,6 +465,11 @@ class OpenRouterProvider: AIProvider {
        let allImages = topLevelImages + blockImages
        let images: [Data]? = allImages.isEmpty ? nil : allImages

+        if let details = apiChunk.usage?.promptTokensDetails,
+           details.cachedTokens != nil || details.cacheWriteTokens != nil {
+            Log.api.info("OpenRouter stream cache usage: model=\(apiChunk.model), created=\(details.cacheWriteTokens ?? 0), read=\(details.cachedTokens ?? 0)")
+        }
+
        return StreamChunk(
            id: apiChunk.id,
            model: apiChunk.model,
@@ -460,7 +484,9 @@ class OpenRouterProvider: AIProvider {
                ChatResponse.Usage(
                    promptTokens: usage.promptTokens,
                    completionTokens: usage.completionTokens,
-                    totalTokens: usage.totalTokens
+                    totalTokens: usage.totalTokens,
+                    cacheCreationInputTokens: usage.promptTokensDetails?.cacheWriteTokens,
+                    cacheReadInputTokens: usage.promptTokensDetails?.cachedTokens
                )
            }
        )