diff --git a/oAI/Providers/AIProvider.swift b/oAI/Providers/AIProvider.swift index da6d418..10f2d00 100644 --- a/oAI/Providers/AIProvider.swift +++ b/oAI/Providers/AIProvider.swift @@ -130,11 +130,23 @@ struct ChatResponse: Codable { let promptTokens: Int let completionTokens: Int let totalTokens: Int + let cacheCreationInputTokens: Int? + let cacheReadInputTokens: Int? + + init(promptTokens: Int, completionTokens: Int, totalTokens: Int, cacheCreationInputTokens: Int? = nil, cacheReadInputTokens: Int? = nil) { + self.promptTokens = promptTokens + self.completionTokens = completionTokens + self.totalTokens = totalTokens + self.cacheCreationInputTokens = cacheCreationInputTokens + self.cacheReadInputTokens = cacheReadInputTokens + } enum CodingKeys: String, CodingKey { case promptTokens = "prompt_tokens" case completionTokens = "completion_tokens" case totalTokens = "total_tokens" + case cacheCreationInputTokens = "cache_creation_input_tokens" + case cacheReadInputTokens = "cache_read_input_tokens" } } diff --git a/oAI/Providers/AnthropicProvider.swift b/oAI/Providers/AnthropicProvider.swift index 9ed37e4..d801467 100644 --- a/oAI/Providers/AnthropicProvider.swift +++ b/oAI/Providers/AnthropicProvider.swift @@ -366,6 +366,19 @@ class AnthropicProvider: AIProvider { } } + // Mark the last message with a cache breakpoint so the next loop + // iteration (or next turn) can reuse everything up through this one. + if var lastMessage = conversationMessages.popLast() { + if let content = lastMessage["content"] as? String { + lastMessage["content"] = [["type": "text", "text": content, "cache_control": ["type": "ephemeral"]]] + } else if var blocks = lastMessage["content"] as? 
[[String: Any]], var lastBlock = blocks.popLast() { + lastBlock["cache_control"] = ["type": "ephemeral"] + blocks.append(lastBlock) + lastMessage["content"] = blocks + } + conversationMessages.append(lastMessage) + } + var body: [String: Any] = [ "model": model, "messages": conversationMessages, @@ -373,7 +386,9 @@ class AnthropicProvider: AIProvider { "stream": false ] if let systemText = systemText { - body["system"] = systemText + // Array form carries a cache breakpoint; also covers tools, which + // render before system in Anthropic's prefix order. + body["system"] = [["type": "text", "text": systemText, "cache_control": ["type": "ephemeral"]]] } if let temperature = temperature { body["temperature"] = temperature @@ -440,6 +455,8 @@ class AnthropicProvider: AIProvider { var currentId = "" var currentModel = request.model var inputTokens = 0 + var cacheCreationTokens: Int? = nil + var cacheReadTokens: Int? = nil for try await line in bytes.lines { // Anthropic SSE: "event: ..." and "data: {...}" @@ -459,6 +476,11 @@ class AnthropicProvider: AIProvider { currentModel = message["model"] as? String ?? request.model if let usageDict = message["usage"] as? [String: Any] { inputTokens = usageDict["input_tokens"] as? Int ?? 0 + cacheCreationTokens = usageDict["cache_creation_input_tokens"] as? Int + cacheReadTokens = usageDict["cache_read_input_tokens"] as? Int + if cacheCreationTokens != nil || cacheReadTokens != nil { + Log.api.info("Anthropic stream cache usage: input=\(inputTokens), created=\(cacheCreationTokens ?? 0), read=\(cacheReadTokens ?? 0)") + } } } @@ -482,7 +504,13 @@ class AnthropicProvider: AIProvider { var usage: ChatResponse.Usage? = nil if let usageDict = event["usage"] as? [String: Any] { let outputTokens = usageDict["output_tokens"] as? Int ?? 
0 - usage = ChatResponse.Usage(promptTokens: inputTokens, completionTokens: outputTokens, totalTokens: inputTokens + outputTokens) + usage = ChatResponse.Usage( + promptTokens: inputTokens, + completionTokens: outputTokens, + totalTokens: inputTokens + outputTokens, + cacheCreationInputTokens: cacheCreationTokens, + cacheReadInputTokens: cacheReadTokens + ) } continuation.yield(StreamChunk( id: currentId, @@ -592,6 +620,19 @@ class AnthropicProvider: AIProvider { } } + // Mark the last message with a cache breakpoint so the next turn can + // reuse everything up through this one as a cached prefix. + if var lastMessage = apiMessages.popLast() { + if let content = lastMessage["content"] as? String { + lastMessage["content"] = [["type": "text", "text": content, "cache_control": ["type": "ephemeral"]]] + } else if var blocks = lastMessage["content"] as? [[String: Any]], var lastBlock = blocks.popLast() { + lastBlock["cache_control"] = ["type": "ephemeral"] + blocks.append(lastBlock) + lastMessage["content"] = blocks + } + apiMessages.append(lastMessage) + } + var body: [String: Any] = [ "model": request.model, "messages": apiMessages, @@ -600,7 +641,10 @@ class AnthropicProvider: AIProvider { ] if let systemText = systemText { - body["system"] = systemText + // Array form (rather than a plain string) carries a cache breakpoint. + // Per Anthropic's render order (tools -> system -> messages), this + // single breakpoint caches the tool definitions too. + body["system"] = [["type": "text", "text": systemText, "cache_control": ["type": "ephemeral"]]] } if let temperature = request.temperature { body["temperature"] = temperature @@ -675,6 +719,11 @@ class AnthropicProvider: AIProvider { let usageDict = json["usage"] as? [String: Any] let inputTokens = usageDict?["input_tokens"] as? Int ?? 0 let outputTokens = usageDict?["output_tokens"] as? Int ?? 0 + let cacheCreationTokens = usageDict?["cache_creation_input_tokens"] as? 
Int + let cacheReadTokens = usageDict?["cache_read_input_tokens"] as? Int + if cacheCreationTokens != nil || cacheReadTokens != nil { + Log.api.info("Anthropic cache usage: input=\(inputTokens), created=\(cacheCreationTokens ?? 0), read=\(cacheReadTokens ?? 0)") + } return ChatResponse( id: id, @@ -685,7 +734,9 @@ class AnthropicProvider: AIProvider { usage: ChatResponse.Usage( promptTokens: inputTokens, completionTokens: outputTokens, - totalTokens: inputTokens + outputTokens + totalTokens: inputTokens + outputTokens, + cacheCreationInputTokens: cacheCreationTokens, + cacheReadInputTokens: cacheReadTokens ), created: Date(), toolCalls: toolCalls.isEmpty ? nil : toolCalls diff --git a/oAI/Providers/OpenRouterModels.swift b/oAI/Providers/OpenRouterModels.swift index 7701a60..4fbb6b3 100644 --- a/oAI/Providers/OpenRouterModels.swift +++ b/oAI/Providers/OpenRouterModels.swift @@ -48,7 +48,12 @@ struct OpenRouterChatRequest: Codable { let toolChoice: String? let modalities: [String]? let reasoning: ReasoningAPIConfig? - + let cacheControl: CacheControl? + + struct CacheControl: Codable { + let type: String + } + struct APIMessage: Codable { let role: String let content: MessageContent @@ -138,6 +143,7 @@ struct OpenRouterChatRequest: Codable { case toolChoice = "tool_choice" case modalities case reasoning + case cacheControl = "cache_control" } } @@ -225,11 +231,23 @@ struct OpenRouterChatResponse: Codable { let promptTokens: Int let completionTokens: Int let totalTokens: Int - + let promptTokensDetails: PromptTokensDetails? + + struct PromptTokensDetails: Codable { + let cachedTokens: Int? + let cacheWriteTokens: Int? 
+ + enum CodingKeys: String, CodingKey { + case cachedTokens = "cached_tokens" + case cacheWriteTokens = "cache_write_tokens" + } + } + enum CodingKeys: String, CodingKey { case promptTokens = "prompt_tokens" case completionTokens = "completion_tokens" case totalTokens = "total_tokens" + case promptTokensDetails = "prompt_tokens_details" } } } diff --git a/oAI/Providers/OpenRouterProvider.swift b/oAI/Providers/OpenRouterProvider.swift index 18e7fed..bcc874a 100644 --- a/oAI/Providers/OpenRouterProvider.swift +++ b/oAI/Providers/OpenRouterProvider.swift @@ -198,6 +198,11 @@ class OpenRouterProvider: AIProvider { } if let maxTokens = maxTokens { body["max_tokens"] = maxTokens } if let temperature = temperature { body["temperature"] = temperature } + // Anthropic models require an explicit cache_control opt-in on OpenRouter; + // other providers cache automatically. + if model.hasPrefix("anthropic/") { + body["cache_control"] = ["type": "ephemeral"] + } var urlRequest = URLRequest(url: url) urlRequest.httpMethod = "POST" @@ -388,6 +393,12 @@ class OpenRouterProvider: AIProvider { ReasoningAPIConfig(effort: $0.effort, exclude: $0.exclude ? true : nil) } + // Anthropic models require an explicit cache_control opt-in on OpenRouter; + // other providers (OpenAI, DeepSeek, Gemini, Grok, etc.) cache automatically. + let cacheControl: OpenRouterChatRequest.CacheControl? = effectiveModel.hasPrefix("anthropic/") + ? .init(type: "ephemeral") + : nil + return OpenRouterChatRequest( model: effectiveModel, messages: apiMessages, @@ -398,7 +409,8 @@ class OpenRouterProvider: AIProvider { tools: request.tools, toolChoice: request.tools != nil ? "auto" : nil, modalities: request.imageGeneration ? ["text", "image"] : nil, - reasoning: reasoningConfig + reasoning: reasoningConfig, + cacheControl: cacheControl ) } @@ -416,6 +428,11 @@ class OpenRouterProvider: AIProvider { let allImages = topLevelImages + blockImages let images: [Data]? = allImages.isEmpty ? 
nil : allImages + if let details = apiResponse.usage?.promptTokensDetails, + details.cachedTokens != nil || details.cacheWriteTokens != nil { + Log.api.info("OpenRouter cache usage: model=\(apiResponse.model), created=\(details.cacheWriteTokens ?? 0), read=\(details.cachedTokens ?? 0)") + } + return ChatResponse( id: apiResponse.id, model: apiResponse.model, @@ -426,7 +443,9 @@ class OpenRouterProvider: AIProvider { ChatResponse.Usage( promptTokens: usage.promptTokens, completionTokens: usage.completionTokens, - totalTokens: usage.totalTokens + totalTokens: usage.totalTokens, + cacheCreationInputTokens: usage.promptTokensDetails?.cacheWriteTokens, + cacheReadInputTokens: usage.promptTokensDetails?.cachedTokens ) }, created: Date(timeIntervalSince1970: TimeInterval(apiResponse.created)), @@ -446,6 +465,11 @@ class OpenRouterProvider: AIProvider { let allImages = topLevelImages + blockImages let images: [Data]? = allImages.isEmpty ? nil : allImages + if let details = apiChunk.usage?.promptTokensDetails, + details.cachedTokens != nil || details.cacheWriteTokens != nil { + Log.api.info("OpenRouter stream cache usage: model=\(apiChunk.model), created=\(details.cacheWriteTokens ?? 0), read=\(details.cachedTokens ?? 0)") + } + return StreamChunk( id: apiChunk.id, model: apiChunk.model, @@ -460,7 +484,9 @@ class OpenRouterProvider: AIProvider { ChatResponse.Usage( promptTokens: usage.promptTokens, completionTokens: usage.completionTokens, - totalTokens: usage.totalTokens + totalTokens: usage.totalTokens, + cacheCreationInputTokens: usage.promptTokensDetails?.cacheWriteTokens, + cacheReadInputTokens: usage.promptTokensDetails?.cachedTokens ) } ) diff --git a/oAI/ViewModels/ChatViewModel.swift b/oAI/ViewModels/ChatViewModel.swift index d398f93..e217b0b 100644 --- a/oAI/ViewModels/ChatViewModel.swift +++ b/oAI/ViewModels/ChatViewModel.swift @@ -934,10 +934,7 @@ Don't narrate future actions ("Let me...") - just use the tools. 
messages[index].tokens = usage.completionTokens if let model = selectedModel { let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0 - let cost: Double? = hasPricing - ? (Double(usage.promptTokens) * model.pricing.prompt / 1_000_000) + - (Double(usage.completionTokens) * model.pricing.completion / 1_000_000) - : nil + let cost: Double? = hasPricing ? calculateCost(usage: usage, pricing: model.pricing) : nil messages[index].cost = cost sessionStats.addMessage(inputTokens: usage.promptTokens, outputTokens: usage.completionTokens, cost: cost) } @@ -1001,10 +998,7 @@ Don't narrate future actions ("Let me...") - just use the tools. messages[index].tokens = usage.completionTokens if let model = selectedModel { let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0 - let cost: Double? = hasPricing - ? (Double(usage.promptTokens) * model.pricing.prompt / 1_000_000) + - (Double(usage.completionTokens) * model.pricing.completion / 1_000_000) - : nil + let cost: Double? = hasPricing ? calculateCost(usage: usage, pricing: model.pricing) : nil messages[index].cost = cost sessionStats.addMessage(inputTokens: usage.promptTokens, outputTokens: usage.completionTokens, cost: cost) } @@ -1529,10 +1523,7 @@ Don't narrate future actions ("Let me...") - just use the tools. // Calculate cost if let usage = totalUsage, let model = selectedModel { let hasPricing = model.pricing.prompt > 0 || model.pricing.completion > 0 - let cost: Double? = hasPricing - ? (Double(usage.promptTokens) * model.pricing.prompt / 1_000_000) + - (Double(usage.completionTokens) * model.pricing.completion / 1_000_000) - : nil + let cost: Double? = hasPricing ? calculateCost(usage: usage, pricing: model.pricing) : nil if let index = messages.lastIndex(where: { $0.id == assistantMessage.id }) { messages[index].cost = cost } @@ -2180,6 +2171,18 @@ Don't narrate future actions ("Let me...") - just use the tools. 
} } + /// Returns the cost of one response: prompt and completion tokens at their per-million rates, plus + /// cache writes at 1.25x and cache reads at 0.1x of `pricing.prompt` (Anthropic-style cache pricing; nil counts are 0). + /// `usage.promptTokens` is billed in full with nothing subtracted, so it must already exclude cached tokens. + /// NOTE(review): the OpenRouter mapping forwards `prompt_tokens` unchanged; if that total includes `cached_tokens`, cached input is billed twice. Confirm. + private func calculateCost(usage: ChatResponse.Usage, pricing: ModelInfo.Pricing) -> Double { + let inputCost = Double(usage.promptTokens) * pricing.prompt / 1_000_000 + let cacheReadCost = Double(usage.cacheReadInputTokens ?? 0) * pricing.prompt * 0.1 / 1_000_000 + let cacheWriteCost = Double(usage.cacheCreationInputTokens ?? 0) * pricing.prompt * 1.25 / 1_000_000 + let outputCost = Double(usage.completionTokens) * pricing.completion / 1_000_000 + return inputCost + cacheReadCost + cacheWriteCost + outputCost + } + /// Summarize a chunk of messages into a concise summary private func summarizeMessageChunk(_ messages: [Message]) async -> String? { guard let provider = providerRegistry.getProvider(for: currentProvider),