first commit
3 src/__init__.py Normal file
@@ -0,0 +1,3 @@
"""News Agent - AI-powered daily tech news aggregator"""

__version__ = "0.1.0"
1 src/aggregator/__init__.py Normal file
@@ -0,0 +1 @@
"""News aggregation from various sources"""
162 src/aggregator/rss_fetcher.py Normal file
@@ -0,0 +1,162 @@
"""RSS feed fetching and parsing"""

import feedparser
import httpx
from datetime import datetime, timedelta, timezone
from hashlib import sha256
from typing import Optional
from email.utils import parsedate_to_datetime

from ..config import RSSSource
from ..storage.models import Article
from ..logger import get_logger

logger = get_logger()


class RSSFetcher:
    """Fetch and parse RSS feeds"""

    def __init__(self, timeout: int = 30, hours_lookback: int = 24):
        """
        Initialize RSS fetcher

        Args:
            timeout: HTTP request timeout in seconds
            hours_lookback: How many hours back to fetch articles
        """
        self.timeout = timeout
        self.hours_lookback = hours_lookback
        self.client = httpx.AsyncClient(
            timeout=timeout,
            follow_redirects=True,
            headers={"User-Agent": "NewsAgent/1.0 RSS Reader"},
        )

    async def close(self):
        """Close HTTP client"""
        await self.client.aclose()

    async def fetch(self, source: RSSSource) -> list[Article]:
        """
        Fetch and parse articles from RSS feed

        Args:
            source: RSS source configuration

        Returns:
            List of Article objects from the feed
        """
        try:
            logger.info(f"Fetching RSS feed: {source.name}")
            response = await self.client.get(str(source.url))
            response.raise_for_status()

            # Parse feed
            feed = feedparser.parse(response.text)

            if feed.bozo:
                logger.warning(f"Feed parsing warning for {source.name}: {feed.bozo_exception}")

            articles = []
            cutoff_time = datetime.now(timezone.utc) - timedelta(hours=self.hours_lookback)

            for entry in feed.entries:
                try:
                    article = self._parse_entry(entry, source)
                    if article and article.published >= cutoff_time:
                        articles.append(article)
                except Exception as e:
                    logger.warning(f"Failed to parse entry from {source.name}: {e}")
                    continue

            logger.info(f"Fetched {len(articles)} articles from {source.name}")
            return articles

        except httpx.HTTPError as e:
            logger.error(f"HTTP error fetching {source.name}: {e}")
            return []
        except Exception as e:
            logger.error(f"Unexpected error fetching {source.name}: {e}")
            return []

    def _parse_entry(
        self, entry: feedparser.FeedParserDict, source: RSSSource
    ) -> Optional[Article]:
        """Parse a single RSS entry into an Article"""
        # Get URL
        url = entry.get("link")
        if not url:
            return None

        # Generate unique ID from URL
        article_id = sha256(url.encode()).hexdigest()

        # Get title
        title = entry.get("title", "Untitled")

        # Get published date
        published = self._parse_date(entry)
        if not published:
            published = datetime.now(timezone.utc)

        # Get summary/content
        summary = entry.get("summary", None)
        content = entry.get("content", [{}])[0].get("value", entry.get("description", title))

        # Fall back to the summary when no usable content was found
        if content == title:
            content = summary or title

        return Article(
            id=article_id,
            url=url,
            title=title,
            summary=summary,
            content=content,
            published=published,
            source=source.name,
            category=source.category,
            fetched_at=datetime.now(timezone.utc),
        )

    def _parse_date(self, entry: feedparser.FeedParserDict) -> Optional[datetime]:
        """Parse published date from entry"""
        # Try the pre-parsed time tuples first
        for date_field in ["published_parsed", "updated_parsed", "created_parsed"]:
            if date_field in entry and entry[date_field]:
                try:
                    time_tuple = entry[date_field]
                    dt = datetime(*time_tuple[:6], tzinfo=timezone.utc)
                    return dt
                except (TypeError, ValueError):
                    continue

        # Fall back to parsing the raw date strings
        for date_field in ["published", "updated", "created"]:
            if date_field in entry and entry[date_field]:
                try:
                    return parsedate_to_datetime(entry[date_field])
                except (TypeError, ValueError):
                    continue

        return None

    async def fetch_all(self, sources: list[RSSSource]) -> list[Article]:
        """
        Fetch articles from multiple RSS sources, one source at a time

        Args:
            sources: List of RSS sources to fetch

        Returns:
            Combined list of articles from all sources
        """
        all_articles = []

        for source in sources:
            articles = await self.fetch(source)
            all_articles.extend(articles)

        logger.info(f"Total articles fetched from all sources: {len(all_articles)}")
        return all_articles
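A minimal usage sketch of the fetcher above (not part of this commit); the source name and feed URL are made-up placeholders, and it assumes the package is importable as src from the repository root.

# Sketch only: drive RSSFetcher directly against one hypothetical source.
import asyncio

from src.aggregator.rss_fetcher import RSSFetcher
from src.config import RSSSource


async def demo():
    fetcher = RSSFetcher(timeout=15, hours_lookback=48)
    source = RSSSource(
        name="Example Feed",                # placeholder name
        url="https://example.com/rss.xml",  # placeholder URL
        category="general",
    )
    try:
        articles = await fetcher.fetch_all([source])
        for article in articles:
            print(article.published, article.title)
    finally:
        await fetcher.close()


asyncio.run(demo())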
1 src/ai/__init__.py Normal file
@@ -0,0 +1 @@
"""AI processing using OpenRouter"""
117 src/ai/client.py Normal file
@@ -0,0 +1,117 @@
"""OpenRouter API client"""

import json
from typing import Any

from openai import AsyncOpenAI

from ..config import get_config
from ..logger import get_logger

logger = get_logger()


class OpenRouterClient:
    """Async client for OpenRouter API"""

    def __init__(self):
        config = get_config()
        env = config.env

        self.client = AsyncOpenAI(
            base_url=config.ai.base_url,
            api_key=env.openrouter_api_key,
            default_headers={
                **({"HTTP-Referer": env.openrouter_site_url} if env.openrouter_site_url else {}),
                **({"X-Title": env.openrouter_site_name} if env.openrouter_site_name else {}),
            },
        )

        self.model = config.ai.model
        logger.info(f"Initialized OpenRouter client with model: {self.model}")

    async def chat_completion(
        self,
        system_prompt: str,
        user_prompt: str,
        temperature: float = 0.7,
        max_tokens: int = 1000,
        json_mode: bool = False,
    ) -> str:
        """
        Send chat completion request

        Args:
            system_prompt: System instruction
            user_prompt: User message
            temperature: Sampling temperature (0-2)
            max_tokens: Maximum tokens to generate
            json_mode: Enable JSON response format

        Returns:
            Response content as string
        """
        try:
            messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ]

            kwargs: dict[str, Any] = {
                "model": self.model,
                "messages": messages,
                "temperature": temperature,
                "max_tokens": max_tokens,
            }

            if json_mode:
                kwargs["response_format"] = {"type": "json_object"}

            response = await self.client.chat.completions.create(**kwargs)

            content = response.choices[0].message.content
            if not content:
                raise ValueError("Empty response from API")

            # Log token usage
            if response.usage:
                logger.debug(
                    f"Tokens used - Prompt: {response.usage.prompt_tokens}, "
                    f"Completion: {response.usage.completion_tokens}, "
                    f"Total: {response.usage.total_tokens}"
                )

            return content

        except Exception as e:
            logger.error(f"OpenRouter API error: {e}")
            raise

    async def chat_completion_json(
        self, system_prompt: str, user_prompt: str, temperature: float = 0.3, max_tokens: int = 500
    ) -> dict[str, Any]:
        """
        Send chat completion request expecting JSON response

        Args:
            system_prompt: System instruction
            user_prompt: User message
            temperature: Sampling temperature
            max_tokens: Maximum tokens to generate

        Returns:
            Parsed JSON response as dictionary
        """
        content = await self.chat_completion(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            temperature=temperature,
            max_tokens=max_tokens,
            json_mode=True,
        )

        try:
            return json.loads(content)
        except json.JSONDecodeError as e:
            logger.error(f"Failed to parse JSON response: {content}")
            raise ValueError(f"Invalid JSON response: {e}")
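A rough sketch of calling the client above outside the pipeline (not part of this commit); it assumes config.yaml exists and OPENROUTER_API_KEY is set in .env, and the prompts are purely illustrative.

# Sketch only: one JSON-mode call through OpenRouterClient.
import asyncio

from src.ai.client import OpenRouterClient


async def demo():
    client = OpenRouterClient()
    result = await client.chat_completion_json(
        system_prompt='Reply with JSON: {"score": <float>, "reason": "<text>"}',
        user_prompt="Rate how newsworthy a new Python release is.",
    )
    print(result.get("score"), result.get("reason"))


asyncio.run(demo())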
118 src/ai/filter.py Normal file
@@ -0,0 +1,118 @@
"""Article relevance filtering using AI"""

from typing import Optional

from ..storage.models import Article
from ..config import get_config
from ..logger import get_logger
from .client import OpenRouterClient
from .prompts import FILTERING_SYSTEM_PROMPT, FILTERING_USER_PROMPT

logger = get_logger()


class ArticleFilter:
    """Filter articles by relevance using AI"""

    def __init__(self, client: OpenRouterClient):
        self.client = client
        config = get_config()
        self.interests = config.ai.interests
        self.min_score = config.ai.filtering.min_score

    async def score_article(self, article: Article) -> Optional[float]:
        """
        Score article relevance (0-10)

        Args:
            article: Article to score

        Returns:
            Relevance score or None if scoring fails
        """
        try:
            # Prepare prompts
            system_prompt = FILTERING_SYSTEM_PROMPT.format(
                interests="\n".join(f"- {interest}" for interest in self.interests)
            )

            # Truncate content for API call
            content_preview = article.content[:500] + ("..." if len(article.content) > 500 else "")

            user_prompt = FILTERING_USER_PROMPT.format(
                title=article.title,
                source=article.source,
                category=article.category,
                content=content_preview,
            )

            # Get score from AI
            response = await self.client.chat_completion_json(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                temperature=0.3,
                max_tokens=200,
            )

            score = float(response.get("score", 0))
            reason = response.get("reason", "No reason provided")

            logger.debug(f"Article '{article.title}' scored {score:.1f}: {reason}")

            return score

        except Exception as e:
            logger.error(f"Failed to score article '{article.title}': {e}")
            return None

    async def is_relevant(self, article: Article) -> tuple[bool, Optional[float]]:
        """
        Check if article meets relevance threshold

        Args:
            article: Article to check

        Returns:
            Tuple of (is_relevant, score)
        """
        score = await self.score_article(article)

        if score is None:
            return False, None

        is_relevant = score >= self.min_score
        return is_relevant, score

    async def filter_articles(
        self, articles: list[Article], max_articles: Optional[int] = None
    ) -> list[tuple[Article, float]]:
        """
        Filter and rank articles by relevance

        Args:
            articles: Articles to filter
            max_articles: Maximum number of articles to return

        Returns:
            List of (article, score) tuples, sorted by score descending
        """
        scored_articles: list[tuple[Article, float]] = []

        for article in articles:
            is_relevant, score = await self.is_relevant(article)

            if is_relevant and score is not None:
                scored_articles.append((article, score))

        # Sort by score descending
        scored_articles.sort(key=lambda x: x[1], reverse=True)

        # Apply limit if specified
        if max_articles:
            scored_articles = scored_articles[:max_articles]

        logger.info(
            f"Filtered {len(articles)} articles down to {len(scored_articles)} relevant ones"
        )

        return scored_articles
60 src/ai/prompts.py Normal file
@@ -0,0 +1,60 @@
"""Prompt templates for AI processing"""

FILTERING_SYSTEM_PROMPT = """You are a news relevance analyzer. Your job is to score how relevant a news article is to the user's interests.

User Interests:
{interests}

Score the article on a scale of 0-10 based on:
- Direct relevance to stated interests (0-4 points)
- Quality and depth of content (0-3 points)
- Timeliness and importance (0-3 points)

Return ONLY a JSON object with this exact format:
{{"score": <float>, "reason": "<brief explanation>"}}

Be strict - only highly relevant articles should score above 7.0."""


FILTERING_USER_PROMPT = """Article Title: {title}

Source: {source}

Category: {category}

Content Preview: {content}

Score this article's relevance (0-10) and explain why."""


SUMMARIZATION_SYSTEM_PROMPT = """You are a technical news summarizer. Create concise, informative summaries of tech articles.

Guidelines:
- Focus on key facts, findings, and implications
- Include important technical details
- Keep summaries to 2-3 sentences
- Use clear, professional language
- Highlight what makes this newsworthy

Return ONLY the summary text, no additional formatting."""


SUMMARIZATION_USER_PROMPT = """Title: {title}

Source: {source}

Content: {content}

Write a concise 2-3 sentence summary highlighting the key information."""


BATCH_SUMMARIZATION_SYSTEM_PROMPT = """You are a technical news summarizer. Create concise summaries for multiple articles.

For each article, provide a 2-3 sentence summary that:
- Captures the key facts and findings
- Highlights technical details
- Explains why it's newsworthy

Return a JSON array with this exact format:
[
    {{"id": "<article_id>", "summary": "<2-3 sentence summary>"}},
    ...
]"""
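One detail these templates rely on (noted here, not part of the commit): the doubled braces are how str.format() emits literal { and }, so the JSON skeletons survive formatting while {interests}, {title}, and the other single-braced fields are substituted.

# Sketch only: doubled braces come out of str.format() as literal braces.
template = 'Interests:\n{interests}\nReply as {{"score": <float>}}'
print(template.format(interests="- Python"))
# Interests:
# - Python
# Reply as {"score": <float>}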
72 src/ai/summarizer.py Normal file
@@ -0,0 +1,72 @@
"""Article summarization using AI"""

from ..storage.models import Article
from ..logger import get_logger
from .client import OpenRouterClient
from .prompts import SUMMARIZATION_SYSTEM_PROMPT, SUMMARIZATION_USER_PROMPT

logger = get_logger()


class ArticleSummarizer:
    """Summarize articles using AI"""

    def __init__(self, client: OpenRouterClient):
        self.client = client

    async def summarize(self, article: Article) -> str:
        """
        Generate concise summary of article

        Args:
            article: Article to summarize

        Returns:
            Summary text (2-3 sentences)
        """
        try:
            # Truncate content if too long
            max_content_length = 2000
            content = article.content
            if len(content) > max_content_length:
                content = content[:max_content_length] + "..."

            user_prompt = SUMMARIZATION_USER_PROMPT.format(
                title=article.title, source=article.source, content=content
            )

            summary = await self.client.chat_completion(
                system_prompt=SUMMARIZATION_SYSTEM_PROMPT,
                user_prompt=user_prompt,
                temperature=0.5,
                max_tokens=300,
            )

            logger.debug(f"Summarized article: {article.title}")
            return summary.strip()

        except Exception as e:
            logger.error(f"Failed to summarize article '{article.title}': {e}")
            # Fall back to the original summary or truncated content
            if article.summary:
                return article.summary
            return article.content[:200] + "..."

    async def summarize_batch(self, articles: list[Article]) -> dict[str, str]:
        """
        Summarize multiple articles

        Args:
            articles: List of articles to summarize

        Returns:
            Dictionary mapping article IDs to summaries
        """
        summaries = {}

        for article in articles:
            summary = await self.summarize(article)
            summaries[article.id] = summary

        logger.info(f"Summarized {len(summaries)} articles")
        return summaries
156 src/config.py Normal file
@@ -0,0 +1,156 @@
"""Configuration management for News Agent"""

import yaml
from pathlib import Path
from typing import Any

from pydantic import BaseModel, Field, HttpUrl
from pydantic_settings import BaseSettings, SettingsConfigDict


class RSSSource(BaseModel):
    """RSS feed source configuration"""

    name: str
    url: HttpUrl
    category: str


class AIFilteringConfig(BaseModel):
    """AI filtering configuration"""

    enabled: bool = True
    min_score: float = Field(ge=0, le=10, default=6.5)
    max_articles: int = Field(ge=1, default=15)


class AIConfig(BaseModel):
    """AI processing configuration"""

    provider: str = "openrouter"
    base_url: str = "https://openrouter.ai/api/v1"
    model: str = "google/gemini-flash-1.5"
    filtering: AIFilteringConfig
    interests: list[str]


class SMTPConfig(BaseModel):
    """SMTP server configuration"""

    host: str = "localhost"
    port: int = 25
    use_tls: bool = False
    use_ssl: bool = False
    username: str | None = None
    password: str | None = None


class EmailConfig(BaseModel):
    """Email configuration"""

    to: str
    from_: str = Field(alias="from")
    from_name: str = "Daily Tech News Agent"
    subject_template: str = "Tech News Digest - {date}"
    smtp: SMTPConfig


class ScheduleConfig(BaseModel):
    """Schedule configuration"""

    time: str = "07:00"
    timezone: str = "Europe/Oslo"


class DatabaseConfig(BaseModel):
    """Database configuration"""

    path: str = "data/articles.db"
    retention_days: int = 30


class LoggingConfig(BaseModel):
    """Logging configuration"""

    level: str = "INFO"
    file: str = "data/logs/news-agent.log"
    max_bytes: int = 10485760
    backup_count: int = 5


class EnvSettings(BaseSettings):
    """Environment variable settings"""

    model_config = SettingsConfigDict(env_file=".env", case_sensitive=False, extra="ignore")

    openrouter_api_key: str
    openrouter_site_url: str | None = None
    openrouter_site_name: str | None = None
    smtp_username: str | None = None
    smtp_password: str | None = None
    error_notification_email: str | None = None


class Config:
    """Main configuration manager"""

    def __init__(self, config_path: str | Path = "config.yaml"):
        """Load configuration from YAML file and environment variables"""
        self.config_path = Path(config_path)

        # Load YAML configuration
        with open(self.config_path) as f:
            self._config: dict[str, Any] = yaml.safe_load(f)

        # Load environment variables
        self.env = EnvSettings()

    @property
    def rss_sources(self) -> list[RSSSource]:
        """Get all RSS sources"""
        return [RSSSource(**src) for src in self._config["sources"]["rss"]]

    @property
    def ai(self) -> AIConfig:
        """Get AI configuration"""
        return AIConfig(**self._config["ai"])

    @property
    def email(self) -> EmailConfig:
        """Get email configuration"""
        email_config = self._config["email"].copy()

        # Merge SMTP credentials from environment variables
        if self.env.smtp_username:
            email_config["smtp"]["username"] = self.env.smtp_username
        if self.env.smtp_password:
            email_config["smtp"]["password"] = self.env.smtp_password

        return EmailConfig(**email_config)

    @property
    def schedule(self) -> ScheduleConfig:
        """Get schedule configuration"""
        return ScheduleConfig(**self._config["schedule"])

    @property
    def database(self) -> DatabaseConfig:
        """Get database configuration"""
        return DatabaseConfig(**self._config["database"])

    @property
    def logging(self) -> LoggingConfig:
        """Get logging configuration"""
        return LoggingConfig(**self._config["logging"])


# Global config instance (lazy loaded)
_config: Config | None = None


def get_config() -> Config:
    """Get or create global config instance"""
    global _config
    if _config is None:
        _config = Config()
    return _config
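The YAML shape that Config appears to expect, inferred from the property accessors above (not part of this commit); every value is a placeholder, and note that constructing Config() also needs OPENROUTER_API_KEY available to EnvSettings.

# Sketch only: write a minimal config.yaml with the keys the properties above read.
from pathlib import Path

EXAMPLE_CONFIG = """\
sources:
  rss:
    - name: Example Feed               # placeholder
      url: https://example.com/rss.xml # placeholder
      category: general
ai:
  filtering:
    min_score: 6.5
    max_articles: 15
  interests:
    - Python and developer tooling
email:
  to: you@example.com
  from: agent@example.com
  smtp:
    host: localhost
    port: 25
schedule: {}
database: {}
logging: {}
"""

Path("config.yaml").write_text(EXAMPLE_CONFIG)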
1 src/email/__init__.py Normal file
@@ -0,0 +1 @@
"""Email generation and sending"""
114 src/email/generator.py Normal file
@@ -0,0 +1,114 @@
"""Email HTML generation from digest data"""

from datetime import datetime
from pathlib import Path
from collections import defaultdict

from jinja2 import Environment, FileSystemLoader
from premailer import transform

from ..storage.models import DigestEntry
from ..logger import get_logger

logger = get_logger()


class EmailGenerator:
    """Generate HTML emails from digest data"""

    def __init__(self):
        # Set up Jinja2 template environment
        template_dir = Path(__file__).parent / "templates"
        self.env = Environment(loader=FileSystemLoader(template_dir))

    def generate_digest_email(
        self, entries: list[DigestEntry], date_str: str, subject: str
    ) -> tuple[str, str]:
        """
        Generate HTML email for daily digest

        Args:
            entries: List of digest entries (articles with summaries)
            date_str: Date string for the digest
            subject: Email subject line

        Returns:
            Tuple of (html_content, text_content)
        """
        # Group articles by category
        articles_by_category = defaultdict(list)
        for entry in entries:
            articles_by_category[entry.category].append(entry)

        # Sort categories
        sorted_categories = sorted(articles_by_category.keys())

        # Get unique sources count
        unique_sources = len(set(entry.article.source for entry in entries))

        # Prepare template data
        template_data = {
            "title": subject,
            "date": date_str,
            "total_articles": len(entries),
            "total_sources": unique_sources,
            "total_categories": len(sorted_categories),
            "articles_by_category": {cat: articles_by_category[cat] for cat in sorted_categories},
        }

        # Render HTML template
        template = self.env.get_template("daily_digest.html")
        html = template.render(**template_data)

        # Inline CSS for email compatibility
        html_inlined = transform(html)

        # Generate plain text version
        text = self._generate_text_version(entries, date_str, subject)

        logger.info(f"Generated email with {len(entries)} articles")

        return html_inlined, text

    def _generate_text_version(
        self, entries: list[DigestEntry], date_str: str, subject: str
    ) -> str:
        """Generate plain text version of email"""
        lines = [
            subject,
            "=" * len(subject),
            "",
            f"Date: {date_str}",
            f"Total Articles: {len(entries)}",
            "",
            "",
        ]

        # Group by category
        articles_by_category = defaultdict(list)
        for entry in entries:
            articles_by_category[entry.category].append(entry)

        # Output each category
        for category in sorted(articles_by_category.keys()):
            lines.append(f"{category.upper()}")
            lines.append("-" * len(category))
            lines.append("")

            for entry in articles_by_category[category]:
                article = entry.article
                lines.append(f"• {article.title}")
                lines.append(f"  Source: {article.source}")
                lines.append(f"  Published: {article.published.strftime('%B %d, %Y at %H:%M')}")
                lines.append(f"  Relevance: {entry.relevance_score:.1f}/10")
                lines.append(f"  URL: {article.url}")
                lines.append(f"  Summary: {entry.ai_summary}")
                lines.append("")

            lines.append("")

        lines.append("")
        lines.append("---")
        lines.append("Generated by News Agent | Powered by OpenRouter AI")

        return "\n".join(lines)
77 src/email/sender.py Normal file
@@ -0,0 +1,77 @@
"""Email sending via SMTP"""

import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
from email.utils import formatdate

from ..config import get_config
from ..logger import get_logger

logger = get_logger()


class EmailSender:
    """Send emails via SMTP"""

    def __init__(self):
        config = get_config()
        self.config = config.email

    def send(self, subject: str, html_content: str, text_content: str) -> bool:
        """
        Send email with HTML and plain text versions

        Args:
            subject: Email subject line
            html_content: HTML email body
            text_content: Plain text email body

        Returns:
            True if sent successfully, False otherwise
        """
        try:
            # Create message
            msg = MIMEMultipart("alternative")
            msg["Subject"] = subject
            msg["From"] = f"{self.config.from_name} <{self.config.from_}>"
            msg["To"] = self.config.to
            msg["Date"] = formatdate(localtime=True)

            # Attach parts
            part_text = MIMEText(text_content, "plain", "utf-8")
            part_html = MIMEText(html_content, "html", "utf-8")

            msg.attach(part_text)
            msg.attach(part_html)

            # Send via SMTP
            smtp_config = self.config.smtp

            if smtp_config.use_ssl:
                server = smtplib.SMTP_SSL(smtp_config.host, smtp_config.port)
            else:
                server = smtplib.SMTP(smtp_config.host, smtp_config.port)

            try:
                if smtp_config.use_tls and not smtp_config.use_ssl:
                    server.starttls()

                # Login if credentials provided
                if smtp_config.username and smtp_config.password:
                    server.login(smtp_config.username, smtp_config.password)

                # Send email
                server.send_message(msg)
                logger.info(f"Email sent successfully to {self.config.to}")
                return True

            finally:
                server.quit()

        except smtplib.SMTPException as e:
            logger.error(f"SMTP error sending email: {e}")
            return False
        except Exception as e:
            logger.error(f"Unexpected error sending email: {e}")
            return False
194 src/email/templates/daily_digest.html Normal file
@@ -0,0 +1,194 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>{{ title }}</title>
    <style>
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 800px;
            margin: 0 auto;
            padding: 20px;
            background-color: #f5f5f5;
        }
        .container {
            background-color: #ffffff;
            border-radius: 8px;
            padding: 30px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }
        .header {
            border-bottom: 3px solid #2563eb;
            padding-bottom: 20px;
            margin-bottom: 30px;
        }
        h1 {
            color: #1e40af;
            margin: 0 0 10px 0;
            font-size: 28px;
        }
        .date {
            color: #6b7280;
            font-size: 14px;
        }
        .summary {
            background-color: #eff6ff;
            border-left: 4px solid #2563eb;
            padding: 15px;
            margin-bottom: 30px;
            border-radius: 4px;
        }
        .summary p {
            margin: 5px 0;
            font-size: 14px;
        }
        .category-section {
            margin-bottom: 40px;
        }
        .category-header {
            background-color: #f3f4f6;
            padding: 10px 15px;
            border-radius: 4px;
            margin-bottom: 20px;
        }
        .category-title {
            color: #374151;
            font-size: 20px;
            font-weight: 600;
            margin: 0;
            text-transform: capitalize;
        }
        .article {
            margin-bottom: 30px;
            padding-bottom: 25px;
            border-bottom: 1px solid #e5e7eb;
        }
        .article:last-child {
            border-bottom: none;
        }
        .article-header {
            margin-bottom: 10px;
        }
        .article-title {
            font-size: 18px;
            font-weight: 600;
            color: #1e40af;
            text-decoration: none;
            line-height: 1.4;
        }
        .article-title:hover {
            color: #2563eb;
            text-decoration: underline;
        }
        .article-meta {
            font-size: 13px;
            color: #6b7280;
            margin: 8px 0;
        }
        .article-meta span {
            margin-right: 15px;
        }
        .score-badge {
            display: inline-block;
            background-color: #dcfce7;
            color: #166534;
            padding: 3px 8px;
            border-radius: 12px;
            font-size: 12px;
            font-weight: 600;
        }
        .article-summary {
            color: #374151;
            font-size: 15px;
            line-height: 1.6;
            margin: 12px 0;
        }
        .read-more {
            display: inline-block;
            color: #2563eb;
            text-decoration: none;
            font-size: 14px;
            font-weight: 500;
            margin-top: 8px;
        }
        .read-more:hover {
            text-decoration: underline;
        }
        .footer {
            margin-top: 40px;
            padding-top: 20px;
            border-top: 2px solid #e5e7eb;
            text-align: center;
            color: #6b7280;
            font-size: 13px;
        }
        @media (max-width: 600px) {
            body {
                padding: 10px;
            }
            .container {
                padding: 20px;
            }
            h1 {
                font-size: 24px;
            }
            .article-title {
                font-size: 16px;
            }
        }
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>{{ title }}</h1>
            <div class="date">{{ date }}</div>
        </div>

        <div class="summary">
            <p><strong>{{ total_articles }}</strong> articles curated from your personalized news sources</p>
            <p><strong>{{ total_sources }}</strong> sources | <strong>{{ total_categories }}</strong> categories</p>
        </div>

        {% for category, articles in articles_by_category.items() %}
        <div class="category-section">
            <div class="category-header">
                <h2 class="category-title">{{ category }}</h2>
            </div>

            {% for entry in articles %}
            <article class="article">
                <div class="article-header">
                    <a href="{{ entry.article.url }}" class="article-title" target="_blank" rel="noopener">
                        {{ entry.article.title }}
                    </a>
                </div>

                <div class="article-meta">
                    <span>{{ entry.article.source }}</span>
                    <span>{{ entry.article.published.strftime('%B %d, %Y at %H:%M') }}</span>
                    <span class="score-badge">Relevance: {{ "%.1f"|format(entry.relevance_score) }}/10</span>
                </div>

                <div class="article-summary">
                    {{ entry.ai_summary }}
                </div>

                <a href="{{ entry.article.url }}" class="read-more" target="_blank" rel="noopener">
                    Read full article →
                </a>
            </article>
            {% endfor %}
        </div>
        {% endfor %}

        <div class="footer">
            <p>Generated by News Agent | Powered by OpenRouter AI</p>
            <p>You received this because you subscribed to daily tech news digests</p>
        </div>
    </div>
</body>
</html>
54 src/logger.py Normal file
@@ -0,0 +1,54 @@
"""Logging configuration for News Agent"""

import logging
import sys
from logging.handlers import RotatingFileHandler
from pathlib import Path

from .config import get_config


def setup_logger(name: str = "news-agent") -> logging.Logger:
    """Set up and configure logger with file and console handlers"""
    config = get_config()
    log_config = config.logging

    # Create logger
    logger = logging.getLogger(name)
    logger.setLevel(getattr(logging, log_config.level.upper()))

    # Avoid duplicate handlers
    if logger.handlers:
        return logger

    # Create formatters
    detailed_formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S"
    )
    simple_formatter = logging.Formatter("%(levelname)s - %(message)s")

    # File handler with rotation
    log_file = Path(log_config.file)
    log_file.parent.mkdir(parents=True, exist_ok=True)

    file_handler = RotatingFileHandler(
        log_file, maxBytes=log_config.max_bytes, backupCount=log_config.backup_count
    )
    file_handler.setLevel(logging.DEBUG)
    file_handler.setFormatter(detailed_formatter)

    # Console handler
    console_handler = logging.StreamHandler(sys.stdout)
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(simple_formatter)

    # Add handlers
    logger.addHandler(file_handler)
    logger.addHandler(console_handler)

    return logger


def get_logger(name: str = "news-agent") -> logging.Logger:
    """Get or create logger instance"""
    return logging.getLogger(name)
153 src/main.py Normal file
@@ -0,0 +1,153 @@
"""Main orchestrator for News Agent"""

import asyncio
from datetime import datetime

from .config import get_config
from .logger import setup_logger, get_logger
from .storage.database import Database
from .storage.models import DigestEntry
from .aggregator.rss_fetcher import RSSFetcher
from .ai.client import OpenRouterClient
from .ai.filter import ArticleFilter
from .ai.summarizer import ArticleSummarizer
from .email.generator import EmailGenerator
from .email.sender import EmailSender


async def main():
    """Main execution flow"""
    # Set up logging
    setup_logger()
    logger = get_logger()

    logger.info("=" * 60)
    logger.info("News Agent starting...")
    logger.info("=" * 60)

    try:
        # Load configuration
        config = get_config()

        # Initialize database
        db = Database(config.database.path)
        await db.initialize()

        # Clean up old articles
        await db.cleanup_old_articles(config.database.retention_days)

        # Initialize RSS fetcher
        fetcher = RSSFetcher()

        # Fetch articles from all sources
        logger.info(f"Fetching from {len(config.rss_sources)} RSS sources...")
        articles = await fetcher.fetch_all(config.rss_sources)

        if not articles:
            logger.warning("No articles fetched from any source")
            await fetcher.close()
            return

        # Save articles to database (deduplication)
        new_articles_count = await db.save_articles(articles)
        logger.info(
            f"Saved {new_articles_count} new articles (filtered {len(articles) - new_articles_count} duplicates)"
        )

        await fetcher.close()

        # Get unprocessed articles
        unprocessed = await db.get_unprocessed_articles()

        if not unprocessed:
            logger.info("No new articles to process")
            return

        logger.info(f"Processing {len(unprocessed)} new articles with AI...")

        # Initialize AI components
        ai_client = OpenRouterClient()
        filter_ai = ArticleFilter(ai_client)
        summarizer = ArticleSummarizer(ai_client)

        # Filter articles by relevance
        logger.info("Filtering articles by relevance...")
        filtered_articles = await filter_ai.filter_articles(
            unprocessed, max_articles=config.ai.filtering.max_articles
        )

        if not filtered_articles:
            logger.warning("No relevant articles found after filtering")
            # Mark all as processed but not included
            for article in unprocessed:
                await db.update_article_processing(
                    article.id, relevance_score=0.0, ai_summary="", included=False
                )
            return

        logger.info(f"Selected {len(filtered_articles)} relevant articles")

        # Summarize filtered articles
        logger.info("Generating AI summaries...")
        digest_entries = []

        for article, score in filtered_articles:
            summary = await summarizer.summarize(article)

            # Update database
            await db.update_article_processing(
                article.id, relevance_score=score, ai_summary=summary, included=True
            )

            # Create digest entry
            entry = DigestEntry(
                article=article,
                relevance_score=score,
                ai_summary=summary,
                category=article.category,
            )
            digest_entries.append(entry)

        # Mark remaining unprocessed articles as processed but not included
        processed_ids = {article.id for article, _ in filtered_articles}
        for article in unprocessed:
            if article.id not in processed_ids:
                await db.update_article_processing(
                    article.id, relevance_score=0.0, ai_summary="", included=False
                )

        # Generate email
        logger.info("Generating email digest...")
        generator = EmailGenerator()

        date_str = datetime.now().strftime("%A, %B %d, %Y")
        subject = config.email.subject_template.format(date=date_str)

        html_content, text_content = generator.generate_digest_email(
            digest_entries, date_str, subject
        )

        # Send email
        logger.info("Sending email...")
        sender = EmailSender()
        success = sender.send(subject, html_content, text_content)

        if success:
            logger.info("=" * 60)
            logger.info(f"Daily digest sent successfully with {len(digest_entries)} articles!")
            logger.info("=" * 60)
        else:
            logger.error("Failed to send email")

    except Exception as e:
        logger.error(f"Fatal error in main execution: {e}", exc_info=True)
        raise


def run():
    """Entry point for command-line execution"""
    asyncio.run(main())


if __name__ == "__main__":
    run()
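Presumably the entry point is invoked as a module from the repository root (the commit does not say); a hedged invocation sketch, assuming config.yaml and .env sit in the working directory.

# Sketch only: equivalent to `python -m src.main` from the repository root.
from src.main import run

run()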
1 src/storage/__init__.py Normal file
@@ -0,0 +1 @@
"""Database storage for articles"""
194 src/storage/database.py Normal file
@@ -0,0 +1,194 @@
"""SQLite database operations for article storage and deduplication"""

import aiosqlite
import json
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional

from .models import Article
from ..logger import get_logger

logger = get_logger()


class Database:
    """Async SQLite database manager"""

    def __init__(self, db_path: str | Path):
        self.db_path = Path(db_path)
        self.db_path.parent.mkdir(parents=True, exist_ok=True)

    async def initialize(self):
        """Create database tables if they don't exist"""
        async with aiosqlite.connect(self.db_path) as db:
            await db.execute(
                """
                CREATE TABLE IF NOT EXISTS articles (
                    id TEXT PRIMARY KEY,
                    url TEXT NOT NULL UNIQUE,
                    title TEXT NOT NULL,
                    summary TEXT,
                    content TEXT NOT NULL,
                    published TEXT NOT NULL,
                    source TEXT NOT NULL,
                    category TEXT NOT NULL,
                    fetched_at TEXT NOT NULL,
                    relevance_score REAL,
                    ai_summary TEXT,
                    processed INTEGER DEFAULT 0,
                    included_in_digest INTEGER DEFAULT 0
                )
                """
            )

            await db.execute(
                """
                CREATE INDEX IF NOT EXISTS idx_published
                ON articles(published)
                """
            )

            await db.execute(
                """
                CREATE INDEX IF NOT EXISTS idx_fetched_at
                ON articles(fetched_at)
                """
            )

            await db.commit()

        logger.info(f"Database initialized at {self.db_path}")

    async def article_exists(self, article_id: str) -> bool:
        """Check if article already exists in database"""
        async with aiosqlite.connect(self.db_path) as db:
            async with db.execute("SELECT 1 FROM articles WHERE id = ?", (article_id,)) as cursor:
                result = await cursor.fetchone()
                return result is not None

    async def save_article(self, article: Article) -> bool:
        """Save article to database. Returns True if saved, False if duplicate"""
        if await self.article_exists(article.id):
            logger.debug(f"Article already exists: {article.title}")
            return False

        async with aiosqlite.connect(self.db_path) as db:
            await db.execute(
                """
                INSERT INTO articles (
                    id, url, title, summary, content, published, source,
                    category, fetched_at, relevance_score, ai_summary,
                    processed, included_in_digest
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    article.id,
                    str(article.url),
                    article.title,
                    article.summary,
                    article.content,
                    article.published.isoformat(),
                    article.source,
                    article.category,
                    article.fetched_at.isoformat(),
                    article.relevance_score,
                    article.ai_summary,
                    int(article.processed),
                    int(article.included_in_digest),
                ),
            )
            await db.commit()

        logger.debug(f"Saved article: {article.title}")
        return True

    async def save_articles(self, articles: list[Article]) -> int:
        """Save multiple articles. Returns count of new articles saved"""
        count = 0
        for article in articles:
            if await self.save_article(article):
                count += 1
        return count

    async def get_unprocessed_articles(self, limit: Optional[int] = None) -> list[Article]:
        """Get articles that haven't been processed by AI yet"""
        query = """
            SELECT * FROM articles
            WHERE processed = 0
            ORDER BY published DESC
        """
        if limit:
            query += f" LIMIT {limit}"

        async with aiosqlite.connect(self.db_path) as db:
            db.row_factory = aiosqlite.Row
            async with db.execute(query) as cursor:
                rows = await cursor.fetchall()
                return [self._row_to_article(row) for row in rows]

    async def update_article_processing(
        self, article_id: str, relevance_score: float, ai_summary: str, included: bool
    ):
        """Update article with AI processing results"""
        async with aiosqlite.connect(self.db_path) as db:
            await db.execute(
                """
                UPDATE articles
                SET relevance_score = ?,
                    ai_summary = ?,
                    processed = 1,
                    included_in_digest = ?
                WHERE id = ?
                """,
                (relevance_score, ai_summary, int(included), article_id),
            )
            await db.commit()

    async def get_todays_digest_articles(self) -> list[Article]:
        """Get all articles included in today's digest"""
        today = datetime.now().date()
        async with aiosqlite.connect(self.db_path) as db:
            db.row_factory = aiosqlite.Row
            async with db.execute(
                """
                SELECT * FROM articles
                WHERE included_in_digest = 1
                AND date(fetched_at) = ?
                ORDER BY relevance_score DESC, published DESC
                """,
                (today.isoformat(),),
            ) as cursor:
                rows = await cursor.fetchall()
                return [self._row_to_article(row) for row in rows]

    async def cleanup_old_articles(self, retention_days: int):
        """Delete articles older than retention period"""
        cutoff_date = datetime.now() - timedelta(days=retention_days)
        async with aiosqlite.connect(self.db_path) as db:
            cursor = await db.execute(
                "DELETE FROM articles WHERE fetched_at < ?", (cutoff_date.isoformat(),)
            )
            deleted = cursor.rowcount
            await db.commit()

        if deleted > 0:
            logger.info(f"Cleaned up {deleted} old articles")

    def _row_to_article(self, row: aiosqlite.Row) -> Article:
        """Convert database row to Article model"""
        return Article(
            id=row["id"],
            url=row["url"],
            title=row["title"],
            summary=row["summary"],
            content=row["content"],
            published=datetime.fromisoformat(row["published"]),
            source=row["source"],
            category=row["category"],
            fetched_at=datetime.fromisoformat(row["fetched_at"]),
            relevance_score=row["relevance_score"],
            ai_summary=row["ai_summary"],
            processed=bool(row["processed"]),
            included_in_digest=bool(row["included_in_digest"]),
        )
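A small round trip through the Database class above (not part of this commit); the path, URL, and article fields are placeholders.

# Sketch only: initialize the schema, then show deduplication by article id.
import asyncio
from datetime import datetime, timezone
from hashlib import sha256

from src.storage.database import Database
from src.storage.models import Article


async def demo():
    db = Database("data/demo.db")
    await db.initialize()

    url = "https://example.com/post"  # placeholder URL
    article = Article(
        id=sha256(url.encode()).hexdigest(),
        url=url,
        title="Demo article",
        content="Body text",
        published=datetime.now(timezone.utc),
        source="Example Feed",
        category="general",
        fetched_at=datetime.now(timezone.utc),
    )

    print(await db.save_article(article))  # True on first insert
    print(await db.save_article(article))  # False: duplicate id is skipped


asyncio.run(demo())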
44 src/storage/models.py Normal file
@@ -0,0 +1,44 @@
"""Data models for articles and digests"""

from datetime import datetime
from typing import Optional

from pydantic import BaseModel, HttpUrl


class Article(BaseModel):
    """Article data model"""

    id: str  # Hash of URL
    url: HttpUrl
    title: str
    summary: Optional[str] = None
    content: str
    published: datetime
    source: str
    category: str
    fetched_at: datetime

    # AI processing fields
    relevance_score: Optional[float] = None
    ai_summary: Optional[str] = None
    processed: bool = False
    included_in_digest: bool = False


class DigestEntry(BaseModel):
    """Entry in daily digest"""

    article: Article
    relevance_score: float
    ai_summary: str
    category: str


class DailyDigest(BaseModel):
    """Complete daily digest"""

    date: str
    entries: list[DigestEntry]
    total_articles_processed: int
    total_articles_included: int
1 src/utils/__init__.py Normal file
@@ -0,0 +1 @@
"""Utility functions"""