Add Anthropic prompt caching (direct + via OpenRouter)

Caches the system prompt/tools and growing conversation history via cache_control breakpoints, cutting cost and latency on repeated turns. Covers both the regular chat path and the tool-calling loop (chatWithToolMessages), which has its own request-building code and was initially missed. Cost calculation now accounts for cache write/read pricing instead of treating all input tokens as full price. Verified live: cache reads grow turn-over-turn in oAI.log.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-06-18 12:43:32 +02:00
parent a793fdacc4
commit 5b99a6f81c
5 changed files with 131 additions and 21 deletions
@@ -934,10 +934,7 @@ Don't narrate future actions ("Let me...") - just use the tools.
                            messages[index].tokens = usage.completionTokens
                            if let model = selectedModel {
                                let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0
-                                let cost: Double? = hasPricing
-                                    ? (Double(usage.promptTokens) * model.pricing.prompt / 1_000_000) +
-                                      (Double(usage.completionTokens) * model.pricing.completion / 1_000_000)
-                                    : nil
+                                let cost: Double? = hasPricing ? calculateCost(usage: usage, pricing: model.pricing) : nil
                                messages[index].cost = cost
                                sessionStats.addMessage(inputTokens: usage.promptTokens, outputTokens: usage.completionTokens, cost: cost)
                            }
@@ -1001,10 +998,7 @@ Don't narrate future actions ("Let me...") - just use the tools.
                            messages[index].tokens = usage.completionTokens
                            if let model = selectedModel {
                                let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0
-                                let cost: Double? = hasPricing
-                                    ? (Double(usage.promptTokens) * model.pricing.prompt / 1_000_000) +
-                                      (Double(usage.completionTokens) * model.pricing.completion / 1_000_000)
-                                    : nil
+                                let cost: Double? = hasPricing ? calculateCost(usage: usage, pricing: model.pricing) : nil
                                messages[index].cost = cost
                                sessionStats.addMessage(inputTokens: usage.promptTokens, outputTokens: usage.completionTokens, cost: cost)
                            }
@@ -1529,10 +1523,7 @@ Don't narrate future actions ("Let me...") - just use the tools.
                // Calculate cost
                if let usage = totalUsage, let model = selectedModel {
                    let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0
-                    let cost: Double? = hasPricing
-                        ? (Double(usage.promptTokens) * model.pricing.prompt / 1_000_000) +
-                          (Double(usage.completionTokens) * model.pricing.completion / 1_000_000)
-                        : nil
+                    let cost: Double? = hasPricing ? calculateCost(usage: usage, pricing: model.pricing) : nil
                    if let index = messages.lastIndex(where: { $0.id == assistantMessage.id }) {
                        messages[index].cost = cost
                    }
@@ -2180,6 +2171,18 @@ Don't narrate future actions ("Let me...") - just use the tools.
        }
    }

+    /// Prices a single response from its token usage and per-million-token rates.
+    /// Cache-read tokens are billed at 0.1x and cache-creation tokens at 1.25x the
+    /// prompt rate, on top of `usage.promptTokens` at the full prompt rate. Assumes
+    /// `promptTokens` excludes cached tokens (nothing is subtracted) — verify upstream.
+    private func calculateCost(usage: ChatResponse.Usage, pricing: ModelInfo.Pricing) -> Double {
+        let completionPart = Double(usage.completionTokens) * pricing.completion / 1_000_000
+        let writePart = Double(usage.cacheCreationInputTokens ?? 0) * pricing.prompt * 1.25 / 1_000_000
+        let readPart = Double(usage.cacheReadInputTokens ?? 0) * pricing.prompt * 0.1 / 1_000_000
+        let promptPart = Double(usage.promptTokens) * pricing.prompt / 1_000_000
+        return promptPart + readPart + writePart + completionPart
+    }
+
    /// Summarize a chunk of messages into a concise summary
    private func summarizeMessageChunk(_ messages: [Message]) async -> String? {
        guard let provider = providerRegistry.getProvider(for: currentProvider),