Add Anthropic prompt caching (direct + via OpenRouter)

Caches the system prompt/tools and growing conversation history via
cache_control breakpoints, cutting cost and latency on repeated turns.
Covers both the regular chat path and the tool-calling loop
(chatWithToolMessages), which has its own request-building code and was
initially missed. Cost calculation now accounts for cache write/read
pricing instead of treating all input tokens as full price. Verified
live: cache reads grow turn-over-turn in oAI.log.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-06-18 12:43:32 +02:00
parent a793fdacc4
commit 5b99a6f81c
5 changed files with 131 additions and 21 deletions
+12
View File
@@ -130,11 +130,23 @@ struct ChatResponse: Codable {
let promptTokens: Int let promptTokens: Int
let completionTokens: Int let completionTokens: Int
let totalTokens: Int let totalTokens: Int
let cacheCreationInputTokens: Int?
let cacheReadInputTokens: Int?
/// Creates a usage record from raw token counts.
///
/// The two cache counters default to `nil`, so callers that have no
/// prompt-cache data can omit them.
init(
    promptTokens: Int,
    completionTokens: Int,
    totalTokens: Int,
    cacheCreationInputTokens: Int? = nil,
    cacheReadInputTokens: Int? = nil
) {
    // Regular token counts.
    self.promptTokens = promptTokens
    self.completionTokens = completionTokens
    self.totalTokens = totalTokens

    // Optional prompt-cache counters.
    self.cacheCreationInputTokens = cacheCreationInputTokens
    self.cacheReadInputTokens = cacheReadInputTokens
}
enum CodingKeys: String, CodingKey { enum CodingKeys: String, CodingKey {
case promptTokens = "prompt_tokens" case promptTokens = "prompt_tokens"
case completionTokens = "completion_tokens" case completionTokens = "completion_tokens"
case totalTokens = "total_tokens" case totalTokens = "total_tokens"
case cacheCreationInputTokens = "cache_creation_input_tokens"
case cacheReadInputTokens = "cache_read_input_tokens"
} }
} }
+55 -4
View File
@@ -366,6 +366,19 @@ class AnthropicProvider: AIProvider {
} }
} }
// Put an ephemeral cache breakpoint on the final message so that a later
// request (next loop iteration or next turn) can reuse the prefix up to and
// including it. String content is promoted to a one-element text-block
// array; existing block arrays get the marker on their last block. Any other
// content shape (or an empty block array) is left untouched.
// NOTE(review): Anthropic caps the number of cache_control breakpoints per
// request — confirm this array is rebuilt on every call so markers from
// earlier iterations cannot accumulate.
if let tailIndex = conversationMessages.indices.last {
    var message = conversationMessages[tailIndex]
    let marker = ["type": "ephemeral"]
    if let text = message["content"] as? String {
        message["content"] = [["type": "text", "text": text, "cache_control": marker]]
    } else if var blocks = message["content"] as? [[String: Any]], let lastBlockIndex = blocks.indices.last {
        blocks[lastBlockIndex]["cache_control"] = marker
        message["content"] = blocks
    }
    conversationMessages[tailIndex] = message
}
var body: [String: Any] = [ var body: [String: Any] = [
"model": model, "model": model,
"messages": conversationMessages, "messages": conversationMessages,
@@ -373,7 +386,9 @@ class AnthropicProvider: AIProvider {
"stream": false "stream": false
] ]
if let systemText = systemText { if let systemText = systemText {
body["system"] = systemText // Array form carries a cache breakpoint; also covers tools, which
// render before system in Anthropic's prefix order.
body["system"] = [["type": "text", "text": systemText, "cache_control": ["type": "ephemeral"]]]
} }
if let temperature = temperature { if let temperature = temperature {
body["temperature"] = temperature body["temperature"] = temperature
@@ -440,6 +455,8 @@ class AnthropicProvider: AIProvider {
var currentId = "" var currentId = ""
var currentModel = request.model var currentModel = request.model
var inputTokens = 0 var inputTokens = 0
var cacheCreationTokens: Int? = nil
var cacheReadTokens: Int? = nil
for try await line in bytes.lines { for try await line in bytes.lines {
// Anthropic SSE: "event: ..." and "data: {...}" // Anthropic SSE: "event: ..." and "data: {...}"
@@ -459,6 +476,11 @@ class AnthropicProvider: AIProvider {
currentModel = message["model"] as? String ?? request.model currentModel = message["model"] as? String ?? request.model
if let usageDict = message["usage"] as? [String: Any] { if let usageDict = message["usage"] as? [String: Any] {
inputTokens = usageDict["input_tokens"] as? Int ?? 0 inputTokens = usageDict["input_tokens"] as? Int ?? 0
cacheCreationTokens = usageDict["cache_creation_input_tokens"] as? Int
cacheReadTokens = usageDict["cache_read_input_tokens"] as? Int
if cacheCreationTokens != nil || cacheReadTokens != nil {
Log.api.info("Anthropic stream cache usage: input=\(inputTokens), created=\(cacheCreationTokens ?? 0), read=\(cacheReadTokens ?? 0)")
}
} }
} }
@@ -482,7 +504,13 @@ class AnthropicProvider: AIProvider {
var usage: ChatResponse.Usage? = nil var usage: ChatResponse.Usage? = nil
if let usageDict = event["usage"] as? [String: Any] { if let usageDict = event["usage"] as? [String: Any] {
let outputTokens = usageDict["output_tokens"] as? Int ?? 0 let outputTokens = usageDict["output_tokens"] as? Int ?? 0
usage = ChatResponse.Usage(promptTokens: inputTokens, completionTokens: outputTokens, totalTokens: inputTokens + outputTokens) usage = ChatResponse.Usage(
promptTokens: inputTokens,
completionTokens: outputTokens,
totalTokens: inputTokens + outputTokens,
cacheCreationInputTokens: cacheCreationTokens,
cacheReadInputTokens: cacheReadTokens
)
} }
continuation.yield(StreamChunk( continuation.yield(StreamChunk(
id: currentId, id: currentId,
@@ -592,6 +620,19 @@ class AnthropicProvider: AIProvider {
} }
} }
// Put an ephemeral cache breakpoint on the final message so the next turn
// can reuse everything up to and including it as a cached prefix. String
// content is promoted to a one-element text-block array; existing block
// arrays get the marker on their last block. Any other content shape (or an
// empty block array) is left untouched.
if let tailIndex = apiMessages.indices.last {
    var message = apiMessages[tailIndex]
    let marker = ["type": "ephemeral"]
    if let text = message["content"] as? String {
        message["content"] = [["type": "text", "text": text, "cache_control": marker]]
    } else if var blocks = message["content"] as? [[String: Any]], let lastBlockIndex = blocks.indices.last {
        blocks[lastBlockIndex]["cache_control"] = marker
        message["content"] = blocks
    }
    apiMessages[tailIndex] = message
}
var body: [String: Any] = [ var body: [String: Any] = [
"model": request.model, "model": request.model,
"messages": apiMessages, "messages": apiMessages,
@@ -600,7 +641,10 @@ class AnthropicProvider: AIProvider {
] ]
if let systemText = systemText { if let systemText = systemText {
body["system"] = systemText // Array form (rather than a plain string) carries a cache breakpoint.
// Per Anthropic's render order (tools -> system -> messages), this
// single breakpoint caches the tool definitions too.
body["system"] = [["type": "text", "text": systemText, "cache_control": ["type": "ephemeral"]]]
} }
if let temperature = request.temperature { if let temperature = request.temperature {
body["temperature"] = temperature body["temperature"] = temperature
@@ -675,6 +719,11 @@ class AnthropicProvider: AIProvider {
let usageDict = json["usage"] as? [String: Any] let usageDict = json["usage"] as? [String: Any]
let inputTokens = usageDict?["input_tokens"] as? Int ?? 0 let inputTokens = usageDict?["input_tokens"] as? Int ?? 0
let outputTokens = usageDict?["output_tokens"] as? Int ?? 0 let outputTokens = usageDict?["output_tokens"] as? Int ?? 0
let cacheCreationTokens = usageDict?["cache_creation_input_tokens"] as? Int
let cacheReadTokens = usageDict?["cache_read_input_tokens"] as? Int
if cacheCreationTokens != nil || cacheReadTokens != nil {
Log.api.info("Anthropic cache usage: input=\(inputTokens), created=\(cacheCreationTokens ?? 0), read=\(cacheReadTokens ?? 0)")
}
return ChatResponse( return ChatResponse(
id: id, id: id,
@@ -685,7 +734,9 @@ class AnthropicProvider: AIProvider {
usage: ChatResponse.Usage( usage: ChatResponse.Usage(
promptTokens: inputTokens, promptTokens: inputTokens,
completionTokens: outputTokens, completionTokens: outputTokens,
totalTokens: inputTokens + outputTokens totalTokens: inputTokens + outputTokens,
cacheCreationInputTokens: cacheCreationTokens,
cacheReadInputTokens: cacheReadTokens
), ),
created: Date(), created: Date(),
toolCalls: toolCalls.isEmpty ? nil : toolCalls toolCalls: toolCalls.isEmpty ? nil : toolCalls
+18
View File
@@ -48,6 +48,11 @@ struct OpenRouterChatRequest: Codable {
let toolChoice: String? let toolChoice: String?
let modalities: [String]? let modalities: [String]?
let reasoning: ReasoningAPIConfig? let reasoning: ReasoningAPIConfig?
let cacheControl: CacheControl?
/// Request-level cache opt-in, encoded under the `cache_control` key of the
/// request. `type` names the caching mode; the provider code in this change
/// only ever sends "ephemeral", and only for models whose identifier starts
/// with "anthropic/".
struct CacheControl: Codable {
let type: String
}
struct APIMessage: Codable { struct APIMessage: Codable {
let role: String let role: String
@@ -138,6 +143,7 @@ struct OpenRouterChatRequest: Codable {
case toolChoice = "tool_choice" case toolChoice = "tool_choice"
case modalities case modalities
case reasoning case reasoning
case cacheControl = "cache_control"
} }
} }
@@ -225,11 +231,23 @@ struct OpenRouterChatResponse: Codable {
let promptTokens: Int let promptTokens: Int
let completionTokens: Int let completionTokens: Int
let totalTokens: Int let totalTokens: Int
let promptTokensDetails: PromptTokensDetails?
/// Optional cache breakdown decoded from the `prompt_tokens_details` object
/// of a usage payload. Both fields are optional, so a payload that omits
/// either key still decodes.
struct PromptTokensDetails: Codable {
// Surfaced to the app as `ChatResponse.Usage.cacheReadInputTokens`.
let cachedTokens: Int?
// Surfaced to the app as `ChatResponse.Usage.cacheCreationInputTokens`.
let cacheWriteTokens: Int?
// Maps the snake_case wire keys onto the camelCase properties.
enum CodingKeys: String, CodingKey {
case cachedTokens = "cached_tokens"
case cacheWriteTokens = "cache_write_tokens"
}
}
enum CodingKeys: String, CodingKey { enum CodingKeys: String, CodingKey {
case promptTokens = "prompt_tokens" case promptTokens = "prompt_tokens"
case completionTokens = "completion_tokens" case completionTokens = "completion_tokens"
case totalTokens = "total_tokens" case totalTokens = "total_tokens"
case promptTokensDetails = "prompt_tokens_details"
} }
} }
} }
+29 -3
View File
@@ -198,6 +198,11 @@ class OpenRouterProvider: AIProvider {
} }
if let maxTokens = maxTokens { body["max_tokens"] = maxTokens } if let maxTokens = maxTokens { body["max_tokens"] = maxTokens }
if let temperature = temperature { body["temperature"] = temperature } if let temperature = temperature { body["temperature"] = temperature }
// Anthropic models require an explicit cache_control opt-in on OpenRouter;
// other providers cache automatically.
if model.hasPrefix("anthropic/") {
body["cache_control"] = ["type": "ephemeral"]
}
var urlRequest = URLRequest(url: url) var urlRequest = URLRequest(url: url)
urlRequest.httpMethod = "POST" urlRequest.httpMethod = "POST"
@@ -388,6 +393,12 @@ class OpenRouterProvider: AIProvider {
ReasoningAPIConfig(effort: $0.effort, exclude: $0.exclude ? true : nil) ReasoningAPIConfig(effort: $0.effort, exclude: $0.exclude ? true : nil)
} }
// Anthropic models require an explicit cache_control opt-in on OpenRouter;
// other providers (OpenAI, DeepSeek, Gemini, Grok, etc.) cache automatically.
let cacheControl: OpenRouterChatRequest.CacheControl? = effectiveModel.hasPrefix("anthropic/")
? .init(type: "ephemeral")
: nil
return OpenRouterChatRequest( return OpenRouterChatRequest(
model: effectiveModel, model: effectiveModel,
messages: apiMessages, messages: apiMessages,
@@ -398,7 +409,8 @@ class OpenRouterProvider: AIProvider {
tools: request.tools, tools: request.tools,
toolChoice: request.tools != nil ? "auto" : nil, toolChoice: request.tools != nil ? "auto" : nil,
modalities: request.imageGeneration ? ["text", "image"] : nil, modalities: request.imageGeneration ? ["text", "image"] : nil,
reasoning: reasoningConfig reasoning: reasoningConfig,
cacheControl: cacheControl
) )
} }
@@ -416,6 +428,11 @@ class OpenRouterProvider: AIProvider {
let allImages = topLevelImages + blockImages let allImages = topLevelImages + blockImages
let images: [Data]? = allImages.isEmpty ? nil : allImages let images: [Data]? = allImages.isEmpty ? nil : allImages
if let details = apiResponse.usage?.promptTokensDetails,
details.cachedTokens != nil || details.cacheWriteTokens != nil {
Log.api.info("OpenRouter cache usage: model=\(apiResponse.model), created=\(details.cacheWriteTokens ?? 0), read=\(details.cachedTokens ?? 0)")
}
return ChatResponse( return ChatResponse(
id: apiResponse.id, id: apiResponse.id,
model: apiResponse.model, model: apiResponse.model,
@@ -426,7 +443,9 @@ class OpenRouterProvider: AIProvider {
ChatResponse.Usage( ChatResponse.Usage(
promptTokens: usage.promptTokens, promptTokens: usage.promptTokens,
completionTokens: usage.completionTokens, completionTokens: usage.completionTokens,
totalTokens: usage.totalTokens totalTokens: usage.totalTokens,
cacheCreationInputTokens: usage.promptTokensDetails?.cacheWriteTokens,
cacheReadInputTokens: usage.promptTokensDetails?.cachedTokens
) )
}, },
created: Date(timeIntervalSince1970: TimeInterval(apiResponse.created)), created: Date(timeIntervalSince1970: TimeInterval(apiResponse.created)),
@@ -446,6 +465,11 @@ class OpenRouterProvider: AIProvider {
let allImages = topLevelImages + blockImages let allImages = topLevelImages + blockImages
let images: [Data]? = allImages.isEmpty ? nil : allImages let images: [Data]? = allImages.isEmpty ? nil : allImages
if let details = apiChunk.usage?.promptTokensDetails,
details.cachedTokens != nil || details.cacheWriteTokens != nil {
Log.api.info("OpenRouter stream cache usage: model=\(apiChunk.model), created=\(details.cacheWriteTokens ?? 0), read=\(details.cachedTokens ?? 0)")
}
return StreamChunk( return StreamChunk(
id: apiChunk.id, id: apiChunk.id,
model: apiChunk.model, model: apiChunk.model,
@@ -460,7 +484,9 @@ class OpenRouterProvider: AIProvider {
ChatResponse.Usage( ChatResponse.Usage(
promptTokens: usage.promptTokens, promptTokens: usage.promptTokens,
completionTokens: usage.completionTokens, completionTokens: usage.completionTokens,
totalTokens: usage.totalTokens totalTokens: usage.totalTokens,
cacheCreationInputTokens: usage.promptTokensDetails?.cacheWriteTokens,
cacheReadInputTokens: usage.promptTokensDetails?.cachedTokens
) )
} }
) )
+15 -12
View File
@@ -934,10 +934,7 @@ Don't narrate future actions ("Let me...") - just use the tools.
messages[index].tokens = usage.completionTokens messages[index].tokens = usage.completionTokens
if let model = selectedModel { if let model = selectedModel {
let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0 let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0
let cost: Double? = hasPricing let cost: Double? = hasPricing ? calculateCost(usage: usage, pricing: model.pricing) : nil
? (Double(usage.promptTokens) * model.pricing.prompt / 1_000_000) +
(Double(usage.completionTokens) * model.pricing.completion / 1_000_000)
: nil
messages[index].cost = cost messages[index].cost = cost
sessionStats.addMessage(inputTokens: usage.promptTokens, outputTokens: usage.completionTokens, cost: cost) sessionStats.addMessage(inputTokens: usage.promptTokens, outputTokens: usage.completionTokens, cost: cost)
} }
@@ -1001,10 +998,7 @@ Don't narrate future actions ("Let me...") - just use the tools.
messages[index].tokens = usage.completionTokens messages[index].tokens = usage.completionTokens
if let model = selectedModel { if let model = selectedModel {
let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0 let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0
let cost: Double? = hasPricing let cost: Double? = hasPricing ? calculateCost(usage: usage, pricing: model.pricing) : nil
? (Double(usage.promptTokens) * model.pricing.prompt / 1_000_000) +
(Double(usage.completionTokens) * model.pricing.completion / 1_000_000)
: nil
messages[index].cost = cost messages[index].cost = cost
sessionStats.addMessage(inputTokens: usage.promptTokens, outputTokens: usage.completionTokens, cost: cost) sessionStats.addMessage(inputTokens: usage.promptTokens, outputTokens: usage.completionTokens, cost: cost)
} }
@@ -1529,10 +1523,7 @@ Don't narrate future actions ("Let me...") - just use the tools.
// Calculate cost // Calculate cost
if let usage = totalUsage, let model = selectedModel { if let usage = totalUsage, let model = selectedModel {
let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0 let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0
let cost: Double? = hasPricing let cost: Double? = hasPricing ? calculateCost(usage: usage, pricing: model.pricing) : nil
? (Double(usage.promptTokens) * model.pricing.prompt / 1_000_000) +
(Double(usage.completionTokens) * model.pricing.completion / 1_000_000)
: nil
if let index = messages.lastIndex(where: { $0.id == assistantMessage.id }) { if let index = messages.lastIndex(where: { $0.id == assistantMessage.id }) {
messages[index].cost = cost messages[index].cost = cost
} }
@@ -2180,6 +2171,18 @@ Don't narrate future actions ("Let me...") - just use the tools.
} }
} }
/// Computes the cost of a single response from its token usage.
///
/// Rates in `pricing` are applied per million tokens. On top of regular
/// input and output tokens, Anthropic-style prompt-cache counters are billed
/// relative to the base input rate: cache reads at 0.1x and cache writes at
/// 1.25x. Missing (`nil`) cache counters contribute nothing.
///
/// `usage.promptTokens` is billed in full at the input rate; it is treated
/// as the uncached remainder, so cache tokens are not subtracted from it.
/// NOTE(review): the OpenRouter path forwards its `prompt_tokens` value
/// unchanged — confirm that figure excludes cached tokens, otherwise they
/// are counted twice here.
///
/// - Parameters:
///   - usage: Token counts reported for the response.
///   - pricing: Prompt and completion rates.
/// - Returns: The sum of input, cache-read, cache-write and output cost.
private func calculateCost(usage: ChatResponse.Usage, pricing: ModelInfo.Pricing) -> Double {
    // Prices `tokens` at `rate` per million, scaled by `multiplier`.
    // The operation order (tokens * rate * multiplier / 1e6) is kept
    // deliberately so floating-point results stay unchanged.
    func price(_ tokens: Int, at rate: Double, times multiplier: Double = 1) -> Double {
        Double(tokens) * rate * multiplier / 1_000_000
    }

    let uncachedInput = price(usage.promptTokens, at: pricing.prompt)
    let cacheReads = price(usage.cacheReadInputTokens ?? 0, at: pricing.prompt, times: 0.1)
    let cacheWrites = price(usage.cacheCreationInputTokens ?? 0, at: pricing.prompt, times: 1.25)
    let output = price(usage.completionTokens, at: pricing.completion)

    return uncachedInput + cacheReads + cacheWrites + output
}
/// Summarize a chunk of messages into a concise summary /// Summarize a chunk of messages into a concise summary
private func summarizeMessageChunk(_ messages: [Message]) async -> String? { private func summarizeMessageChunk(_ messages: [Message]) async -> String? {
guard let provider = providerRegistry.getProvider(for: currentProvider), guard let provider = providerRegistry.getProvider(for: currentProvider),