Add Anthropic prompt caching (direct + via OpenRouter)
Caches the system prompt/tools and growing conversation history via cache_control breakpoints, cutting cost and latency on repeated turns. Covers both the regular chat path and the tool-calling loop (chatWithToolMessages), which has its own request-building code and was initially missed. Cost calculation now accounts for cache write/read pricing instead of treating all input tokens as full price. Verified live: cache reads grow turn-over-turn in oAI.log. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -934,10 +934,7 @@ Don't narrate future actions ("Let me...") - just use the tools.
|
||||
messages[index].tokens = usage.completionTokens
|
||||
if let model = selectedModel {
|
||||
let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0
|
||||
let cost: Double? = hasPricing
|
||||
? (Double(usage.promptTokens) * model.pricing.prompt / 1_000_000) +
|
||||
(Double(usage.completionTokens) * model.pricing.completion / 1_000_000)
|
||||
: nil
|
||||
let cost: Double? = hasPricing ? calculateCost(usage: usage, pricing: model.pricing) : nil
|
||||
messages[index].cost = cost
|
||||
sessionStats.addMessage(inputTokens: usage.promptTokens, outputTokens: usage.completionTokens, cost: cost)
|
||||
}
|
||||
@@ -1001,10 +998,7 @@ Don't narrate future actions ("Let me...") - just use the tools.
|
||||
messages[index].tokens = usage.completionTokens
|
||||
if let model = selectedModel {
|
||||
let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0
|
||||
let cost: Double? = hasPricing
|
||||
? (Double(usage.promptTokens) * model.pricing.prompt / 1_000_000) +
|
||||
(Double(usage.completionTokens) * model.pricing.completion / 1_000_000)
|
||||
: nil
|
||||
let cost: Double? = hasPricing ? calculateCost(usage: usage, pricing: model.pricing) : nil
|
||||
messages[index].cost = cost
|
||||
sessionStats.addMessage(inputTokens: usage.promptTokens, outputTokens: usage.completionTokens, cost: cost)
|
||||
}
|
||||
@@ -1529,10 +1523,7 @@ Don't narrate future actions ("Let me...") - just use the tools.
|
||||
// Calculate cost
|
||||
if let usage = totalUsage, let model = selectedModel {
|
||||
let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0
|
||||
let cost: Double? = hasPricing
|
||||
? (Double(usage.promptTokens) * model.pricing.prompt / 1_000_000) +
|
||||
(Double(usage.completionTokens) * model.pricing.completion / 1_000_000)
|
||||
: nil
|
||||
let cost: Double? = hasPricing ? calculateCost(usage: usage, pricing: model.pricing) : nil
|
||||
if let index = messages.lastIndex(where: { $0.id == assistantMessage.id }) {
|
||||
messages[index].cost = cost
|
||||
}
|
||||
@@ -2180,6 +2171,18 @@ Don't narrate future actions ("Let me...") - just use the tools.
|
||||
}
|
||||
}
|
||||
|
||||
/// Computes the monetary cost of a single response from its token usage.
///
/// Prices are expressed per one million tokens. When the usage carries
/// Anthropic-style prompt-cache counters they are billed relative to the base
/// input rate: cache reads at 0.1x and cache writes at 1.25x. A missing cache
/// counter is treated as zero tokens.
///
/// `usage.promptTokens` is taken to be the uncached remainder of the input, so
/// the cache counters are added on top of it rather than subtracted from it.
///
/// - Parameters:
///   - usage: Token counts reported for one response.
///   - pricing: Per-million-token rates for prompt and completion tokens.
/// - Returns: The sum of the uncached-input, cache-read, cache-write, and
///   output costs.
private func calculateCost(usage: ChatResponse.Usage, pricing: ModelInfo.Pricing) -> Double {
    // Rates are quoted per million tokens.
    let tokensPerRateUnit: Double = 1_000_000

    let cachedReadTokens = usage.cacheReadInputTokens ?? 0
    let cachedWriteTokens = usage.cacheCreationInputTokens ?? 0

    let uncachedPromptPortion = Double(usage.promptTokens) * pricing.prompt / tokensPerRateUnit
    // Cache hits are billed at a tenth of the base input rate.
    let readPortion = Double(cachedReadTokens) * pricing.prompt * 0.1 / tokensPerRateUnit
    // Populating the cache carries a 25% surcharge over the base input rate.
    let writePortion = Double(cachedWriteTokens) * pricing.prompt * 1.25 / tokensPerRateUnit
    let completionPortion = Double(usage.completionTokens) * pricing.completion / tokensPerRateUnit

    return uncachedPromptPortion + readPortion + writePortion + completionPortion
}
|
||||
|
||||
/// Summarize a chunk of messages into a concise summary
|
||||
private func summarizeMessageChunk(_ messages: [Message]) async -> String? {
|
||||
guard let provider = providerRegistry.getProvider(for: currentProvider),
|
||||
|
||||
Reference in New Issue
Block a user