Add Anthropic prompt caching (direct + via OpenRouter)

Caches the system prompt/tools and growing conversation history via
cache_control breakpoints, cutting cost and latency on repeated turns.
Covers both the regular chat path and the tool-calling loop
(chatWithToolMessages), which has its own request-building code and was
initially missed. Cost calculation now accounts for cache write/read
pricing instead of treating all input tokens as full price. Verified
live: cache reads grow turn-over-turn in oAI.log.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-18 12:43:32 +02:00
parent a793fdacc4
commit 5b99a6f81c
5 changed files with 131 additions and 21 deletions
+15 -12
View File
@@ -934,10 +934,7 @@ Don't narrate future actions ("Let me...") - just use the tools.
messages[index].tokens = usage.completionTokens
if let model = selectedModel {
let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0
let cost: Double? = hasPricing
? (Double(usage.promptTokens) * model.pricing.prompt / 1_000_000) +
(Double(usage.completionTokens) * model.pricing.completion / 1_000_000)
: nil
let cost: Double? = hasPricing ? calculateCost(usage: usage, pricing: model.pricing) : nil
messages[index].cost = cost
sessionStats.addMessage(inputTokens: usage.promptTokens, outputTokens: usage.completionTokens, cost: cost)
}
@@ -1001,10 +998,7 @@ Don't narrate future actions ("Let me...") - just use the tools.
messages[index].tokens = usage.completionTokens
if let model = selectedModel {
let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0
let cost: Double? = hasPricing
? (Double(usage.promptTokens) * model.pricing.prompt / 1_000_000) +
(Double(usage.completionTokens) * model.pricing.completion / 1_000_000)
: nil
let cost: Double? = hasPricing ? calculateCost(usage: usage, pricing: model.pricing) : nil
messages[index].cost = cost
sessionStats.addMessage(inputTokens: usage.promptTokens, outputTokens: usage.completionTokens, cost: cost)
}
@@ -1529,10 +1523,7 @@ Don't narrate future actions ("Let me...") - just use the tools.
// Calculate cost
if let usage = totalUsage, let model = selectedModel {
let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0
let cost: Double? = hasPricing
? (Double(usage.promptTokens) * model.pricing.prompt / 1_000_000) +
(Double(usage.completionTokens) * model.pricing.completion / 1_000_000)
: nil
let cost: Double? = hasPricing ? calculateCost(usage: usage, pricing: model.pricing) : nil
if let index = messages.lastIndex(where: { $0.id == assistantMessage.id }) {
messages[index].cost = cost
}
@@ -2180,6 +2171,18 @@ Don't narrate future actions ("Let me...") - just use the tools.
}
}
/// Computes the cost of a single response from its token usage.
///
/// Prompt-cache usage (Anthropic-style) is priced separately when the usage
/// reports it: cache reads are charged at 0.1x the base prompt rate and cache
/// writes at 1.25x. Missing cache counts are treated as zero, so usage without
/// cache fields is priced as plain prompt + completion tokens.
///
/// `usage.promptTokens` is billed in full at the base prompt rate; cache tokens
/// are not subtracted from it, on the assumption that it already holds only the
/// uncached remainder.
/// NOTE(review): confirm that assumption for every provider that fills in the
/// cache fields.
///
/// - Parameters:
///   - usage: Token counts reported for the response.
///   - pricing: Model rates; the division by 1,000,000 below treats them as
///     per-million-token prices.
/// - Returns: The sum of uncached prompt, cache read, cache write and
///   completion costs.
private func calculateCost(usage: ChatResponse.Usage, pricing: ModelInfo.Pricing) -> Double {
    let tokensPerPricingUnit = 1_000_000.0
    let cacheReadMultiplier = 0.1
    let cacheWriteMultiplier = 1.25

    // Absent cache counters mean "no cache activity" for this response.
    let tokensReadFromCache = usage.cacheReadInputTokens ?? 0
    let tokensWrittenToCache = usage.cacheCreationInputTokens ?? 0

    let promptCost = Double(usage.promptTokens) * pricing.prompt / tokensPerPricingUnit
    let readCost = Double(tokensReadFromCache) * pricing.prompt * cacheReadMultiplier / tokensPerPricingUnit
    let writeCost = Double(tokensWrittenToCache) * pricing.prompt * cacheWriteMultiplier / tokensPerPricingUnit
    let completionCost = Double(usage.completionTokens) * pricing.completion / tokensPerPricingUnit

    return promptCost + readCost + writeCost + completionCost
}
/// Summarize a chunk of messages into a concise summary
private func summarizeMessageChunk(_ messages: [Message]) async -> String? {
guard let provider = providerRegistry.getProvider(for: currentProvider),