Add Anthropic prompt caching (direct + via OpenRouter)
Caches the system prompt, tool definitions, and the growing conversation history via cache_control breakpoints, cutting cost and latency on repeated turns. Covers both the regular chat path and the tool-calling loop (chatWithToolMessages), which has its own request-building code and was initially missed. Cost calculation now accounts for cache write/read pricing instead of treating all input tokens as full price. Verified live: cache reads grow turn-over-turn in oAI.log.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -130,11 +130,23 @@ struct ChatResponse: Codable {
|
||||
let promptTokens: Int
|
||||
let completionTokens: Int
|
||||
let totalTokens: Int
|
||||
let cacheCreationInputTokens: Int?
|
||||
let cacheReadInputTokens: Int?
|
||||
|
||||
/// Creates a usage record from explicit token counts.
///
/// - Parameters:
///   - promptTokens: Input-side token count as reported by the provider.
///   - completionTokens: Output-side token count as reported by the provider.
///   - totalTokens: Combined count supplied by the caller (not recomputed here).
///   - cacheCreationInputTokens: Tokens written to the prompt cache; defaults to `nil`.
///   - cacheReadInputTokens: Tokens served from the prompt cache; defaults to `nil`.
///
/// Both cache parameters default to `nil`, so call sites that pass only the
/// three base counts remain valid.
init(promptTokens: Int, completionTokens: Int, totalTokens: Int, cacheCreationInputTokens: Int? = nil, cacheReadInputTokens: Int? = nil) {
|
||||
self.promptTokens = promptTokens
|
||||
self.completionTokens = completionTokens
|
||||
self.totalTokens = totalTokens
|
||||
self.cacheCreationInputTokens = cacheCreationInputTokens
|
||||
self.cacheReadInputTokens = cacheReadInputTokens
|
||||
}
|
||||
|
||||
/// Maps the camelCase properties to their snake_case JSON keys.
/// The two cache keys use the same names that this file reads from the
/// Anthropic `usage` dictionary ("cache_creation_input_tokens" and
/// "cache_read_input_tokens").
enum CodingKeys: String, CodingKey {
|
||||
case promptTokens = "prompt_tokens"
|
||||
case completionTokens = "completion_tokens"
|
||||
case totalTokens = "total_tokens"
|
||||
case cacheCreationInputTokens = "cache_creation_input_tokens"
|
||||
case cacheReadInputTokens = "cache_read_input_tokens"
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -366,6 +366,19 @@ class AnthropicProvider: AIProvider {
|
||||
}
|
||||
}
|
||||
|
||||
// Mark the last message with a cache breakpoint so the next loop
|
||||
// iteration (or next turn) can reuse everything up through this one.
|
||||
// Pop the final message, attach an ephemeral cache_control marker to it, and
// push it back. An empty message list is left untouched.
if var lastMessage = conversationMessages.popLast() {
|
||||
// Plain-string content is converted to a single text block, because the
// cache_control marker is attached to a content block rather than a string.
// NOTE(review): Anthropic's prompt-caching docs list empty text blocks as
// non-cacheable — confirm an empty string cannot reach this branch.
if let content = lastMessage["content"] as? String {
|
||||
lastMessage["content"] = [["type": "text", "text": content, "cache_control": ["type": "ephemeral"]]]
|
||||
// Block-array content: only the last block is marked; earlier blocks are kept as-is.
} else if var blocks = lastMessage["content"] as? [[String: Any]], var lastBlock = blocks.popLast() {
|
||||
lastBlock["cache_control"] = ["type": "ephemeral"]
|
||||
blocks.append(lastBlock)
|
||||
lastMessage["content"] = blocks
|
||||
}
|
||||
// Reached on every path: content that is neither a string nor a non-empty
// block array is re-appended unchanged, i.e. without a breakpoint.
conversationMessages.append(lastMessage)
|
||||
}
|
||||
|
||||
var body: [String: Any] = [
|
||||
"model": model,
|
||||
"messages": conversationMessages,
|
||||
@@ -373,7 +386,9 @@ class AnthropicProvider: AIProvider {
|
||||
"stream": false
|
||||
]
|
||||
if let systemText = systemText {
|
||||
body["system"] = systemText
|
||||
// Array form carries a cache breakpoint; also covers tools, which
|
||||
// render before system in Anthropic's prefix order.
|
||||
body["system"] = [["type": "text", "text": systemText, "cache_control": ["type": "ephemeral"]]]
|
||||
}
|
||||
if let temperature = temperature {
|
||||
body["temperature"] = temperature
|
||||
@@ -440,6 +455,8 @@ class AnthropicProvider: AIProvider {
|
||||
var currentId = ""
|
||||
var currentModel = request.model
|
||||
var inputTokens = 0
|
||||
var cacheCreationTokens: Int? = nil
|
||||
var cacheReadTokens: Int? = nil
|
||||
|
||||
for try await line in bytes.lines {
|
||||
// Anthropic SSE: "event: ..." and "data: {...}"
|
||||
@@ -459,6 +476,11 @@ class AnthropicProvider: AIProvider {
|
||||
currentModel = message["model"] as? String ?? request.model
|
||||
if let usageDict = message["usage"] as? [String: Any] {
|
||||
inputTokens = usageDict["input_tokens"] as? Int ?? 0
|
||||
cacheCreationTokens = usageDict["cache_creation_input_tokens"] as? Int
|
||||
cacheReadTokens = usageDict["cache_read_input_tokens"] as? Int
|
||||
if cacheCreationTokens != nil || cacheReadTokens != nil {
|
||||
Log.api.info("Anthropic stream cache usage: input=\(inputTokens), created=\(cacheCreationTokens ?? 0), read=\(cacheReadTokens ?? 0)")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -482,7 +504,13 @@ class AnthropicProvider: AIProvider {
|
||||
var usage: ChatResponse.Usage? = nil
|
||||
if let usageDict = event["usage"] as? [String: Any] {
|
||||
let outputTokens = usageDict["output_tokens"] as? Int ?? 0
|
||||
usage = ChatResponse.Usage(promptTokens: inputTokens, completionTokens: outputTokens, totalTokens: inputTokens + outputTokens)
|
||||
usage = ChatResponse.Usage(
|
||||
promptTokens: inputTokens,
|
||||
completionTokens: outputTokens,
|
||||
totalTokens: inputTokens + outputTokens,
|
||||
cacheCreationInputTokens: cacheCreationTokens,
|
||||
cacheReadInputTokens: cacheReadTokens
|
||||
)
|
||||
}
|
||||
continuation.yield(StreamChunk(
|
||||
id: currentId,
|
||||
@@ -592,6 +620,19 @@ class AnthropicProvider: AIProvider {
|
||||
}
|
||||
}
|
||||
|
||||
// Mark the last message with a cache breakpoint so the next turn can
|
||||
// reuse everything up through this one as a cached prefix.
|
||||
// Pop the final message, attach an ephemeral cache_control marker to it, and
// push it back. An empty message list is left untouched.
if var lastMessage = apiMessages.popLast() {
|
||||
// Plain-string content is converted to a single text block, because the
// cache_control marker is attached to a content block rather than a string.
// NOTE(review): Anthropic's prompt-caching docs list empty text blocks as
// non-cacheable — confirm an empty string cannot reach this branch.
if let content = lastMessage["content"] as? String {
|
||||
lastMessage["content"] = [["type": "text", "text": content, "cache_control": ["type": "ephemeral"]]]
|
||||
// Block-array content: only the last block is marked; earlier blocks are kept as-is.
} else if var blocks = lastMessage["content"] as? [[String: Any]], var lastBlock = blocks.popLast() {
|
||||
lastBlock["cache_control"] = ["type": "ephemeral"]
|
||||
blocks.append(lastBlock)
|
||||
lastMessage["content"] = blocks
|
||||
}
|
||||
// Reached on every path: content that is neither a string nor a non-empty
// block array is re-appended unchanged, i.e. without a breakpoint.
apiMessages.append(lastMessage)
|
||||
}
|
||||
|
||||
var body: [String: Any] = [
|
||||
"model": request.model,
|
||||
"messages": apiMessages,
|
||||
@@ -600,7 +641,10 @@ class AnthropicProvider: AIProvider {
|
||||
]
|
||||
|
||||
if let systemText = systemText {
|
||||
body["system"] = systemText
|
||||
// Array form (rather than a plain string) carries a cache breakpoint.
|
||||
// Per Anthropic's render order (tools -> system -> messages), this
|
||||
// single breakpoint caches the tool definitions too.
|
||||
body["system"] = [["type": "text", "text": systemText, "cache_control": ["type": "ephemeral"]]]
|
||||
}
|
||||
if let temperature = request.temperature {
|
||||
body["temperature"] = temperature
|
||||
@@ -675,6 +719,11 @@ class AnthropicProvider: AIProvider {
|
||||
let usageDict = json["usage"] as? [String: Any]
|
||||
let inputTokens = usageDict?["input_tokens"] as? Int ?? 0
|
||||
let outputTokens = usageDict?["output_tokens"] as? Int ?? 0
|
||||
let cacheCreationTokens = usageDict?["cache_creation_input_tokens"] as? Int
|
||||
let cacheReadTokens = usageDict?["cache_read_input_tokens"] as? Int
|
||||
if cacheCreationTokens != nil || cacheReadTokens != nil {
|
||||
Log.api.info("Anthropic cache usage: input=\(inputTokens), created=\(cacheCreationTokens ?? 0), read=\(cacheReadTokens ?? 0)")
|
||||
}
|
||||
|
||||
return ChatResponse(
|
||||
id: id,
|
||||
@@ -685,7 +734,9 @@ class AnthropicProvider: AIProvider {
|
||||
usage: ChatResponse.Usage(
|
||||
promptTokens: inputTokens,
|
||||
completionTokens: outputTokens,
|
||||
totalTokens: inputTokens + outputTokens
|
||||
totalTokens: inputTokens + outputTokens,
|
||||
cacheCreationInputTokens: cacheCreationTokens,
|
||||
cacheReadInputTokens: cacheReadTokens
|
||||
),
|
||||
created: Date(),
|
||||
toolCalls: toolCalls.isEmpty ? nil : toolCalls
|
||||
|
||||
@@ -48,7 +48,12 @@ struct OpenRouterChatRequest: Codable {
|
||||
let toolChoice: String?
|
||||
let modalities: [String]?
|
||||
let reasoning: ReasoningAPIConfig?
|
||||
|
||||
let cacheControl: CacheControl?
|
||||
|
||||
/// Payload for the request's top-level `cache_control` field (the enclosing
/// request encodes `cacheControl` under the key "cache_control").
/// The request builder in this change sets `type` to "ephemeral" for models
/// whose identifier starts with "anthropic/" and passes `nil` otherwise.
struct CacheControl: Codable {
|
||||
let type: String
|
||||
}
|
||||
|
||||
struct APIMessage: Codable {
|
||||
let role: String
|
||||
let content: MessageContent
|
||||
@@ -138,6 +143,7 @@ struct OpenRouterChatRequest: Codable {
|
||||
case toolChoice = "tool_choice"
|
||||
case modalities
|
||||
case reasoning
|
||||
case cacheControl = "cache_control"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -225,11 +231,23 @@ struct OpenRouterChatResponse: Codable {
|
||||
let promptTokens: Int
|
||||
let completionTokens: Int
|
||||
let totalTokens: Int
|
||||
|
||||
let promptTokensDetails: PromptTokensDetails?
|
||||
|
||||
/// Optional cache breakdown decoded from the usage object's
/// "prompt_tokens_details" key. Both fields are optional, so a response that
/// omits either one still decodes.
struct PromptTokensDetails: Codable {
|
||||
// Forwarded by the provider as `cacheReadInputTokens` and logged as "read".
let cachedTokens: Int?
|
||||
// Forwarded by the provider as `cacheCreationInputTokens` and logged as "created".
let cacheWriteTokens: Int?
|
||||
|
||||
// snake_case wire names for the two fields above.
enum CodingKeys: String, CodingKey {
|
||||
case cachedTokens = "cached_tokens"
|
||||
case cacheWriteTokens = "cache_write_tokens"
|
||||
}
|
||||
}
|
||||
|
||||
/// Maps the usage properties to their snake_case JSON keys, including the
/// optional nested cache breakdown under "prompt_tokens_details".
enum CodingKeys: String, CodingKey {
|
||||
case promptTokens = "prompt_tokens"
|
||||
case completionTokens = "completion_tokens"
|
||||
case totalTokens = "total_tokens"
|
||||
case promptTokensDetails = "prompt_tokens_details"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -198,6 +198,11 @@ class OpenRouterProvider: AIProvider {
|
||||
}
|
||||
if let maxTokens = maxTokens { body["max_tokens"] = maxTokens }
|
||||
if let temperature = temperature { body["temperature"] = temperature }
|
||||
// Anthropic models require an explicit cache_control opt-in on OpenRouter;
|
||||
// other providers cache automatically.
|
||||
if model.hasPrefix("anthropic/") {
|
||||
body["cache_control"] = ["type": "ephemeral"]
|
||||
}
|
||||
|
||||
var urlRequest = URLRequest(url: url)
|
||||
urlRequest.httpMethod = "POST"
|
||||
@@ -388,6 +393,12 @@ class OpenRouterProvider: AIProvider {
|
||||
ReasoningAPIConfig(effort: $0.effort, exclude: $0.exclude ? true : nil)
|
||||
}
|
||||
|
||||
// Anthropic models require an explicit cache_control opt-in on OpenRouter;
|
||||
// other providers (OpenAI, DeepSeek, Gemini, Grok, etc.) cache automatically.
|
||||
let cacheControl: OpenRouterChatRequest.CacheControl? = effectiveModel.hasPrefix("anthropic/")
|
||||
? .init(type: "ephemeral")
|
||||
: nil
|
||||
|
||||
return OpenRouterChatRequest(
|
||||
model: effectiveModel,
|
||||
messages: apiMessages,
|
||||
@@ -398,7 +409,8 @@ class OpenRouterProvider: AIProvider {
|
||||
tools: request.tools,
|
||||
toolChoice: request.tools != nil ? "auto" : nil,
|
||||
modalities: request.imageGeneration ? ["text", "image"] : nil,
|
||||
reasoning: reasoningConfig
|
||||
reasoning: reasoningConfig,
|
||||
cacheControl: cacheControl
|
||||
)
|
||||
}
|
||||
|
||||
@@ -416,6 +428,11 @@ class OpenRouterProvider: AIProvider {
|
||||
let allImages = topLevelImages + blockImages
|
||||
let images: [Data]? = allImages.isEmpty ? nil : allImages
|
||||
|
||||
if let details = apiResponse.usage?.promptTokensDetails,
|
||||
details.cachedTokens != nil || details.cacheWriteTokens != nil {
|
||||
Log.api.info("OpenRouter cache usage: model=\(apiResponse.model), created=\(details.cacheWriteTokens ?? 0), read=\(details.cachedTokens ?? 0)")
|
||||
}
|
||||
|
||||
return ChatResponse(
|
||||
id: apiResponse.id,
|
||||
model: apiResponse.model,
|
||||
@@ -426,7 +443,9 @@ class OpenRouterProvider: AIProvider {
|
||||
ChatResponse.Usage(
|
||||
promptTokens: usage.promptTokens,
|
||||
completionTokens: usage.completionTokens,
|
||||
totalTokens: usage.totalTokens
|
||||
totalTokens: usage.totalTokens,
|
||||
cacheCreationInputTokens: usage.promptTokensDetails?.cacheWriteTokens,
|
||||
cacheReadInputTokens: usage.promptTokensDetails?.cachedTokens
|
||||
)
|
||||
},
|
||||
created: Date(timeIntervalSince1970: TimeInterval(apiResponse.created)),
|
||||
@@ -446,6 +465,11 @@ class OpenRouterProvider: AIProvider {
|
||||
let allImages = topLevelImages + blockImages
|
||||
let images: [Data]? = allImages.isEmpty ? nil : allImages
|
||||
|
||||
if let details = apiChunk.usage?.promptTokensDetails,
|
||||
details.cachedTokens != nil || details.cacheWriteTokens != nil {
|
||||
Log.api.info("OpenRouter stream cache usage: model=\(apiChunk.model), created=\(details.cacheWriteTokens ?? 0), read=\(details.cachedTokens ?? 0)")
|
||||
}
|
||||
|
||||
return StreamChunk(
|
||||
id: apiChunk.id,
|
||||
model: apiChunk.model,
|
||||
@@ -460,7 +484,9 @@ class OpenRouterProvider: AIProvider {
|
||||
ChatResponse.Usage(
|
||||
promptTokens: usage.promptTokens,
|
||||
completionTokens: usage.completionTokens,
|
||||
totalTokens: usage.totalTokens
|
||||
totalTokens: usage.totalTokens,
|
||||
cacheCreationInputTokens: usage.promptTokensDetails?.cacheWriteTokens,
|
||||
cacheReadInputTokens: usage.promptTokensDetails?.cachedTokens
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user