diff --git a/.changeset/rag-context-budget.md b/.changeset/rag-context-budget.md new file mode 100644 index 0000000..ece48d4 --- /dev/null +++ b/.changeset/rag-context-budget.md @@ -0,0 +1,13 @@ +--- +"think-app": minor +--- + +Maximize RAG context utilization with dynamic budget system and improve chat UX + +- Add dynamic context budget that scales with model context window size +- Widen distance thresholds proportionally when budget allows more content +- Include AI-generated summaries as context overview/fallback for each memory +- Subtract page content from budget in browser extension chat to prevent overflow +- Rewrite system prompt for better grounding and natural source attribution +- Fix missing bullet point markers in chat message markdown rendering +- Remove hover blur effect from assistant chat message bubbles diff --git a/app/src/components/ChatMessage.tsx b/app/src/components/ChatMessage.tsx index be9d98b..d7bf237 100644 --- a/app/src/components/ChatMessage.tsx +++ b/app/src/components/ChatMessage.tsx @@ -42,7 +42,7 @@ export function ChatMessage({ message }: ChatMessageProps) { "p-4 rounded-2xl", isUser ? "bg-primary text-primary-foreground" - : cn(glass.base, glass.hover) + : glass.base )} > {isUser ? 
( diff --git a/app/src/index.css b/app/src/index.css index 8eb0d0d..549b295 100644 --- a/app/src/index.css +++ b/app/src/index.css @@ -153,9 +153,15 @@ .chat-prose p:last-child { margin-bottom: 0; } - .chat-prose ul, .chat-prose ol { + .chat-prose ul { margin: 0.5em 0; padding-left: 1.5em; + list-style-type: disc; + } + .chat-prose ol { + margin: 0.5em 0; + padding-left: 1.5em; + list-style-type: decimal; } .chat-prose li { margin: 0.25em 0; diff --git a/backend/app/native_messaging.py b/backend/app/native_messaging.py index 349ef64..a650b77 100644 --- a/backend/app/native_messaging.py +++ b/backend/app/native_messaging.py @@ -14,11 +14,13 @@ from .services.embeddings import ( get_embedding, get_current_embedding_model, filter_memories_dynamically, format_memories_as_context, + compute_context_budget, ) from .services.ai import ( - chat, process_memory_async, process_conversation_title_async, + chat, get_model, process_memory_async, process_conversation_title_async, maybe_rewrite_query, generate_followup_suggestions, ) +from .models_info import get_context_window from .services.query import preprocess_query, extract_keywords, is_special_prompt, execute_special_handler from .events import event_manager, MemoryEvent, EventType @@ -283,6 +285,14 @@ async def _chat_message(self, params: dict) -> dict: sources = [] return_page_summary = None + # Compute context budget based on model and conversation history + model = get_model() + context_window = get_context_window(model) + # Account for page content that will also consume context space + page_content_chars = min(len(page_content), 8000) if page_content else 0 + context_budget = compute_context_budget(context_window, history) + context_budget = max(2000, context_budget - page_content_chars) + # Generate page summary for memory search (only on first message) if page_content and not page_summary: page_summary = await self._generate_page_summary(page_content, page_title) @@ -333,7 +343,7 @@ async def _chat_message(self, 
params: dict) -> dict: # Use model-specific thresholds for filtering embedding_model = get_current_embedding_model() filtered_memories = filter_memories_dynamically( - memories, embedding_model=embedding_model + memories, embedding_model=embedding_model, context_budget_chars=context_budget ) # Build sources list for the response @@ -349,7 +359,7 @@ async def _chat_message(self, params: dict) -> dict: ] # Format memories as context - memories_context = format_memories_as_context(filtered_memories) + memories_context = format_memories_as_context(filtered_memories, max_chars=context_budget) if memories_context: context_parts.append(memories_context) diff --git a/backend/app/routes/chat.py b/backend/app/routes/chat.py index d00fc4f..5112182 100644 --- a/backend/app/routes/chat.py +++ b/backend/app/routes/chat.py @@ -14,7 +14,7 @@ from ..services.ai.query_rewriting import maybe_rewrite_query from ..services.ai.suggestions import get_quick_prompts, generate_followup_suggestions from ..services.query.special_handlers import is_special_prompt, execute_special_handler -from ..services.embeddings.filtering import filter_memories_dynamically, format_memories_as_context +from ..services.embeddings.filtering import filter_memories_dynamically, format_memories_as_context, compute_context_budget from ..db.search import search_similar_memories from ..schemas import ChatRequest from .. 
import config @@ -66,6 +66,11 @@ async def _retrieve_context( sources = [] attached_context = "" + # Compute context budget based on model and conversation history + model = get_model() + context_window = get_context_window(model) + total_budget = compute_context_budget(context_window, history) + # Handle explicitly attached memories first if attached_memory_ids: attached_memories = [] @@ -80,7 +85,9 @@ async def _retrieve_context( }) if attached_memories: - attached_context = "## User's Selected Memory:\n" + format_memories_as_context(attached_memories) + # Give attached memories up to half the budget; RAG gets the other half + attached_budget = total_budget // 2 + attached_context = "## User's Selected Memory:\n" + format_memories_as_context(attached_memories, max_chars=attached_budget) logger.info(f"Using {len(attached_memories)} attached memories as context") # Skip RAG for very short messages (< 10 chars) or when explicitly disabled @@ -118,9 +125,13 @@ async def _retrieve_context( ) if similar_memories: + # RAG budget: half if attached memories exist, full otherwise + rag_budget = total_budget // 2 if attached_context else total_budget # Filter using dynamic threshold with model-specific thresholds - filtered_memories = filter_memories_dynamically(similar_memories, embedding_model=embedding_model) - context = format_memories_as_context(filtered_memories) + filtered_memories = filter_memories_dynamically( + similar_memories, embedding_model=embedding_model, context_budget_chars=rag_budget + ) + context = format_memories_as_context(filtered_memories, max_chars=rag_budget) # Build sources list from filtered memories, avoiding duplicates with attached attached_ids = {s["id"] for s in sources} for m in filtered_memories: diff --git a/backend/app/services/ai/client.py b/backend/app/services/ai/client.py index fc17a1e..816a860 100644 --- a/backend/app/services/ai/client.py +++ b/backend/app/services/ai/client.py @@ -5,11 +5,15 @@ # Custom system prompt for Think 
-SYSTEM_PROMPT = """You are Think, a friendly personal assistant with access to the user's saved memories and notes. You help them recall information, answer questions, and have natural conversations. - -When context from their memories is provided, use it naturally to inform your responses without explicitly mentioning "your saved article" or "your memories" - just incorporate the knowledge seamlessly. - -Keep responses conversational and concise. Be helpful and warm, like a knowledgeable friend.""" +SYSTEM_PROMPT = """You are Think, a helpful personal assistant. You help users recall and explore information from their saved content. +The content can be owned by the user (like notes and voice memos) or from third-party sources (like web pages, videos, and audio). +You use this information to answer questions, provide summaries, and assist with tasks. + +Guidelines: +- Ground your answers in the provided context. If it doesn't contain enough information to answer, say so rather than guessing. +- Be conversational and concise. +- You can naturally reference sources (e.g. "from a saved article", "in one of your videos"), but never output the raw type tags like [web] or [video] in your responses. 
logger = logging.getLogger(__name__)

# Budget constants
BUDGET_FLOOR_CHARS = 2000  # never hand back less than this, even with a full history
BUDGET_CEILING_CHARS = 60000  # never hand back more than this, even for huge windows
DEFAULT_BUDGET_CHARS = 8000  # budget used by callers that do not compute one
SYSTEM_PROMPT_CHARS_ESTIMATE = 500
RESPONSE_RESERVE_TOKENS = 1024
CHARS_PER_TOKEN = 4  # rough chars-per-token ratio used for every estimate below


def _message_content_chars(message: object) -> int:
    """Return the number of text characters carried by one history message.

    Handles the shapes a chat message's ``content`` can take:
    missing / ``None`` (counted as 0), a plain string, or a list of parts
    where each part is either a string or a dict with a ``"text"`` field
    (non-text parts such as images contribute 0 characters).
    """
    if not isinstance(message, dict):
        return 0

    content = message.get("content")
    if not content:
        # Covers both a missing key and an explicit ``content: None``.
        return 0
    if isinstance(content, str):
        return len(content)
    if isinstance(content, list):
        total = 0
        for part in content:
            if isinstance(part, str):
                total += len(part)
            elif isinstance(part, dict):
                total += len(part.get("text") or "")
        return total
    # Unknown shape: fall back to the length of its string form.
    return len(str(content))


def compute_context_budget(
    context_window_tokens: int,
    history: list[dict],
    system_prompt_chars: int = SYSTEM_PROMPT_CHARS_ESTIMATE,
    response_reserve_tokens: int = RESPONSE_RESERVE_TOKENS,
) -> int:
    """Compute how many characters of memory context we can fit.

    Estimates token usage for system prompt, conversation history, and
    response reserve, then returns the remaining space as characters.
    Result is clamped between BUDGET_FLOOR_CHARS and BUDGET_CEILING_CHARS.

    Args:
        context_window_tokens: Size of the model's context window, in tokens.
        history: Prior conversation messages. ``None`` or an empty list is
            treated as "no history"; messages whose ``content`` is ``None``
            or missing count as zero characters.
        system_prompt_chars: Estimated length of the system prompt, in chars.
        response_reserve_tokens: Minimum number of tokens kept free for the
            model's reply.

    Returns:
        The character budget available for retrieved memory context.
    """
    # Estimate tokens used by history (None-safe on both the list and each message)
    history_chars = sum(_message_content_chars(m) for m in history or [])
    history_tokens = history_chars // CHARS_PER_TOKEN

    # Estimate tokens used by system prompt
    system_tokens = system_prompt_chars // CHARS_PER_TOKEN

    # Reserve at least response_reserve_tokens or 10% of window, whichever is larger
    effective_reserve = max(response_reserve_tokens, context_window_tokens // 10)

    # Available tokens for context (may be negative; the clamp below handles it)
    available_tokens = (
        context_window_tokens - system_tokens - history_tokens - effective_reserve
    )

    # Convert to chars and clamp into the allowed range
    available_chars = available_tokens * CHARS_PER_TOKEN
    budget = max(BUDGET_FLOOR_CHARS, min(BUDGET_CEILING_CHARS, available_chars))

    logger.info(
        "Context budget: %s chars (window=%s, history_tokens=%s, available_tokens=%s)",
        budget,
        context_window_tokens,
        history_tokens,
        available_tokens,
    )
    return budget
+ budget_scale = context_budget_chars / DEFAULT_BUDGET_CHARS + # Sort by distance (lowest/best first) sorted_memories = sorted(memories, key=lambda m: m.get("distance") or 999) # Log what we're working with - logger.info(f"Filtering {len(sorted_memories)} memories (model: {embedding_model})") + logger.info(f"Filtering {len(sorted_memories)} memories (model: {embedding_model}, budget_scale: {budget_scale:.1f}x)") for m in sorted_memories[:5]: dist = m.get("distance") dist_str = f"{dist:.3f}" if dist is not None else "N/A" rrf = m.get("rrf_score") or 0 rrf_str = f"{rrf:.4f}" if rrf else "N/A" - logger.info( + logger.debug( f" [{m.get('match_type', '?')}] {m.get('title', '')[:50]}... dist={dist_str} rrf={rrf_str}" ) @@ -62,18 +114,24 @@ def filter_memories_dynamically( # Calculate dynamic threshold: include results within range of best # Tighter range for better matches, looser for weaker ones + # Scale max_results based on budget (capped at reasonable limits) + # Widen acceptance range when we have more budget to fill: + # at 2x budget add +0.01, at 4x add +0.02, capped at +0.03 + budget_bonus = min(0.03, max(0.0, (budget_scale - 1.0) * 0.01)) + if best_distance < thresholds["excellent"]: - # Excellent match: include results within +0.08 - threshold = best_distance + 0.08 - max_results = 5 + threshold = best_distance + 0.08 + budget_bonus + max_results = min(10, max(5, int(5 * budget_scale))) elif best_distance < thresholds["good"]: - # Good match: include results within +0.06 - threshold = best_distance + 0.06 - max_results = 3 + threshold = best_distance + 0.06 + budget_bonus + max_results = min(8, max(3, int(3 * budget_scale))) else: - # Marginal match: only include very close results - threshold = best_distance + 0.04 - max_results = 2 + # Marginal match: half the bonus to stay conservative + threshold = best_distance + 0.04 + (budget_bonus * 0.5) + max_results = min(4, max(2, int(2 * budget_scale))) + + # Never exceed the model's absolute cutoff + threshold = 
_CONTEXT_HEADING = "## Relevant Memories:\n\n"
_ENTRY_SEPARATOR = "\n\n---\n\n"
_ELLIPSIS = "..."
_MIN_PER_MEMORY_CHARS = 2000  # floor that keeps each entry meaningful
_MIN_EXCERPT_CHARS = 200  # below this a truncated excerpt is not worth including


def _format_metadata_line(memory: dict) -> str:
    """Build a compact metadata line: memory type + saved date.

    The type is read from ``type`` or ``memory_type`` (default ``"note"``).
    The date is read from ``created_at`` or ``date`` and may be an ISO-8601
    string (a trailing ``Z`` is accepted) or a ``datetime``; anything else,
    or an unparseable string, simply omits the date.
    """
    # str() keeps the join below safe if the type is stored as a non-string value.
    memory_type = str(memory.get("type") or memory.get("memory_type") or "note")
    created = memory.get("created_at") or memory.get("date")

    date_str = ""
    if isinstance(created, str):
        try:
            # fromisoformat does not accept a bare "Z" suffix on older Pythons.
            parsed = datetime.fromisoformat(created.replace("Z", "+00:00"))
            date_str = parsed.strftime("%Y-%m-%d")
        except (ValueError, TypeError):
            date_str = ""
    elif isinstance(created, datetime):
        date_str = created.strftime("%Y-%m-%d")

    parts = [memory_type]
    if date_str:
        parts.append(f"saved {date_str}")
    return f"[{' | '.join(parts)}]"


def _truncate_text(text: str, limit: int) -> str:
    """Cut ``text`` to at most ``limit`` characters, ellipsis included."""
    if len(text) <= limit:
        return text
    if limit <= len(_ELLIPSIS):
        return text[: max(0, limit)]
    return text[: limit - len(_ELLIPSIS)] + _ELLIPSIS


def _build_memory_body(content: str, summary: str, budget: int) -> tuple[str, bool]:
    """Choose the body text for one memory; never longer than ``budget``.

    Returns ``(body, summary_included)``.
    """
    if not summary:
        # No summary available: just use (possibly truncated) content.
        return _truncate_text(content, budget), False
    if not content:
        return _truncate_text(summary, budget), True

    overview = f"{summary}\n\n"
    if len(content) <= budget:
        # Content fits fully: prepend the summary as an overview when there is room.
        if len(overview) + len(content) <= budget:
            return overview + content, True
        return content, False

    # Content must be truncated: keep the summary as a fallback in front of it.
    content_budget = budget - len(overview)
    if content_budget > _MIN_EXCERPT_CHARS:
        return overview + _truncate_text(content, content_budget), True
    if len(summary) <= budget:
        # Not enough room for both; the summary alone is the better use of space.
        return summary, True
    return _truncate_text(content, budget), False


def format_memories_as_context(memories: list[dict], max_chars: int = DEFAULT_BUDGET_CHARS) -> str:
    """Format retrieved memories into a context string for the LLM.

    Expects memories to be pre-filtered by filter_memories_dynamically.
    Dynamically allocates space per memory and includes AI-generated
    summaries as overview (for long content) or fallback (when truncated).

    The returned string, including its heading, the separators between
    entries and any truncation ellipsis, is never longer than ``max_chars``.

    Args:
        memories: Memory dicts; ``title``, ``content``, ``summary`` and the
            metadata keys are all optional and may be ``None``.
        max_chars: Total character budget for the returned string.

    Returns:
        The formatted context, or ``""`` when there are no memories or
        nothing fits in the budget.
    """
    if not memories:
        return ""

    # Per-memory char budget (header + body), floored to keep each entry meaningful.
    per_memory_limit = max(_MIN_PER_MEMORY_CHARS, max_chars // len(memories))

    context_parts: list[str] = []
    total_chars = len(_CONTEXT_HEADING)  # the heading consumes budget too
    summaries_used = 0

    for memory in memories:
        title = memory.get("title") or "Untitled"
        content = memory.get("content") or ""
        summary = memory.get("summary") or ""

        header = f"### {title}\n{_format_metadata_line(memory)}\n"
        separator_len = len(_ENTRY_SEPARATOR) if context_parts else 0

        # Space left for this entry's body once header and separator are paid for.
        remaining = max_chars - total_chars - separator_len - len(header)
        body_budget = min(per_memory_limit - len(header), remaining)

        if body_budget <= 0:
            # Try to squeeze in just the summary as a last-resort entry.
            if summary and len(summary) <= remaining:
                context_parts.append(header + summary)
                total_chars += separator_len + len(header) + len(summary)
                summaries_used += 1
            break

        content_fits = len(content) <= body_budget
        summary_fits = bool(summary) and len(summary) <= body_budget
        if not content_fits and not summary_fits and body_budget <= _MIN_EXCERPT_CHARS:
            # Only a sliver of room is left and nothing fits whole: stop here.
            break

        body, summary_included = _build_memory_body(content, summary, body_budget)
        if summary_included:
            summaries_used += 1

        context_parts.append(header + body)
        total_chars += separator_len + len(header) + len(body)

    if not context_parts:
        return ""

    result = _CONTEXT_HEADING + _ENTRY_SEPARATOR.join(context_parts)
    utilization_pct = round((len(result) / max_chars) * 100, 1) if max_chars > 0 else 0
    logger.info(
        "Formatted %s memories into %s chars "
        "(budget: %s, utilization: %s%%, summaries: %s/%s)",
        len(context_parts),
        len(result),
        max_chars,
        utilization_pct,
        summaries_used,
        len(context_parts),
    )
    return result