Add Anthropic prompt caching (direct + via OpenRouter)

Caches the system prompt/tools and growing conversation history via cache_control breakpoints, cutting cost and latency on repeated turns. Covers both the regular chat path and the tool-calling loop (chatWithToolMessages), which has its own request-building code and was initially missed. Cost calculation now accounts for cache write/read pricing instead of treating all input tokens as full price. Verified live: cache reads grow turn-over-turn in oAI.log.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-18 12:43:32 +02:00
parent a793fdacc4
commit 5b99a6f81c
5 changed files with 131 additions and 21 deletions
@@ -366,6 +366,19 @@ class AnthropicProvider: AIProvider {
            }
        }

+        // Mark the last message with a cache breakpoint so the next loop
+        // iteration (or next turn) can reuse everything up through this one.
+        if var lastMessage = conversationMessages.popLast() {
+            if let content = lastMessage["content"] as? String {
+                lastMessage["content"] = [["type": "text", "text": content, "cache_control": ["type": "ephemeral"]]]
+            } else if var blocks = lastMessage["content"] as? [[String: Any]], var lastBlock = blocks.popLast() {
+                lastBlock["cache_control"] = ["type": "ephemeral"]
+                blocks.append(lastBlock)
+                lastMessage["content"] = blocks
+            }
+            conversationMessages.append(lastMessage)
+        }
+
        var body: [String: Any] = [
            "model": model,
            "messages": conversationMessages,
@@ -373,7 +386,9 @@ class AnthropicProvider: AIProvider {
            "stream": false
        ]
        if let systemText = systemText {
-            body["system"] = systemText
+            // Array form carries a cache breakpoint; also covers tools, which
+            // render before system in Anthropic's prefix order.
+            body["system"] = [["type": "text", "text": systemText, "cache_control": ["type": "ephemeral"]]]
        }
        if let temperature = temperature {
            body["temperature"] = temperature
@@ -440,6 +455,8 @@ class AnthropicProvider: AIProvider {
                    var currentId = ""
                    var currentModel = request.model
                    var inputTokens = 0
+                    var cacheCreationTokens: Int? = nil
+                    var cacheReadTokens: Int? = nil

                    for try await line in bytes.lines {
                        // Anthropic SSE: "event: ..." and "data: {...}"
@@ -459,6 +476,11 @@ class AnthropicProvider: AIProvider {
                                currentModel = message["model"] as? String ?? request.model
                                if let usageDict = message["usage"] as? [String: Any] {
                                    inputTokens = usageDict["input_tokens"] as? Int ?? 0
+                                    cacheCreationTokens = usageDict["cache_creation_input_tokens"] as? Int
+                                    cacheReadTokens = usageDict["cache_read_input_tokens"] as? Int
+                                    if cacheCreationTokens != nil || cacheReadTokens != nil {
+                                        Log.api.info("Anthropic stream cache usage: input=\(inputTokens), created=\(cacheCreationTokens ?? 0), read=\(cacheReadTokens ?? 0)")
+                                    }
                                }
                            }

@@ -482,7 +504,13 @@ class AnthropicProvider: AIProvider {
                            var usage: ChatResponse.Usage? = nil
                            if let usageDict = event["usage"] as? [String: Any] {
                                let outputTokens = usageDict["output_tokens"] as? Int ?? 0
-                                usage = ChatResponse.Usage(promptTokens: inputTokens, completionTokens: outputTokens, totalTokens: inputTokens + outputTokens)
+                                usage = ChatResponse.Usage(
+                                    promptTokens: inputTokens,
+                                    completionTokens: outputTokens,
+                                    totalTokens: inputTokens + outputTokens,
+                                    cacheCreationInputTokens: cacheCreationTokens,
+                                    cacheReadInputTokens: cacheReadTokens
+                                )
                            }
                            continuation.yield(StreamChunk(
                                id: currentId,
@@ -592,6 +620,19 @@ class AnthropicProvider: AIProvider {
            }
        }

+        // Mark the last message with a cache breakpoint so the next turn can
+        // reuse everything up through this one as a cached prefix.
+        if var lastMessage = apiMessages.popLast() {
+            if let content = lastMessage["content"] as? String {
+                lastMessage["content"] = [["type": "text", "text": content, "cache_control": ["type": "ephemeral"]]]
+            } else if var blocks = lastMessage["content"] as? [[String: Any]], var lastBlock = blocks.popLast() {
+                lastBlock["cache_control"] = ["type": "ephemeral"]
+                blocks.append(lastBlock)
+                lastMessage["content"] = blocks
+            }
+            apiMessages.append(lastMessage)
+        }
+
        var body: [String: Any] = [
            "model": request.model,
            "messages": apiMessages,
@@ -600,7 +641,10 @@ class AnthropicProvider: AIProvider {
        ]

        if let systemText = systemText {
-            body["system"] = systemText
+            // Array form (rather than a plain string) carries a cache breakpoint.
+            // Per Anthropic's render order (tools -> system -> messages), this
+            // single breakpoint caches the tool definitions too.
+            body["system"] = [["type": "text", "text": systemText, "cache_control": ["type": "ephemeral"]]]
        }
        if let temperature = request.temperature {
            body["temperature"] = temperature
@@ -675,6 +719,11 @@ class AnthropicProvider: AIProvider {
        let usageDict = json["usage"] as? [String: Any]
        let inputTokens = usageDict?["input_tokens"] as? Int ?? 0
        let outputTokens = usageDict?["output_tokens"] as? Int ?? 0
+        let cacheCreationTokens = usageDict?["cache_creation_input_tokens"] as? Int
+        let cacheReadTokens = usageDict?["cache_read_input_tokens"] as? Int
+        if cacheCreationTokens != nil || cacheReadTokens != nil {
+            Log.api.info("Anthropic cache usage: input=\(inputTokens), created=\(cacheCreationTokens ?? 0), read=\(cacheReadTokens ?? 0)")
+        }

        return ChatResponse(
            id: id,
@@ -685,7 +734,9 @@ class AnthropicProvider: AIProvider {
            usage: ChatResponse.Usage(
                promptTokens: inputTokens,
                completionTokens: outputTokens,
-                totalTokens: inputTokens + outputTokens
+                totalTokens: inputTokens + outputTokens,
+                cacheCreationInputTokens: cacheCreationTokens,
+                cacheReadInputTokens: cacheReadTokens
            ),
            created: Date(),
            toolCalls: toolCalls.isEmpty ? nil : toolCalls