diff --git a/.env.example b/.env.example index b1274db6..d0ed3430 100644 --- a/.env.example +++ b/.env.example @@ -88,12 +88,15 @@ TTS_BINDING_API_VERSION= # ============================================================================== # Optional: Enable web search capabilities -# [Optional] Provider: perplexity, tavily, serper, jina, exa +# [Optional] Provider: perplexity, tavily, serper, jina, exa, baidu, searxng SEARCH_PROVIDER=perplexity # [Optional] API key for your chosen search provider SEARCH_API_KEY=pplx-xxx +# [Optional] Base URL for self-hosted search providers (e.g., SearXNG) +SEARCH_BASE_URL= + # ============================================================================== # Cloud Deployment Configuration # ============================================================================== diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 535e8d83..0ec57968 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,7 +2,7 @@ Contributing to DeepTutor 🚀 Thank you for your interest in contributing to DeepTutor! We are committed to building a smooth and robust intelligent learning companion, and we welcome developers of all skill levels to join us. Join our community for discussion, support, and collaboration:

-Discord  +Discord  WeChat  Feishu

diff --git a/README.md b/README.md index 33a3166f..c7067393 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ [![License](https://img.shields.io/badge/License-AGPL--3.0-blue?style=flat-square)](LICENSE)

- Discord + Discord    Feishu    @@ -287,8 +287,9 @@ cp .env.example .env | `FRONTEND_PORT` | No | Frontend port (default: `3782`) | | `NEXT_PUBLIC_API_BASE` | No | **Frontend API URL** - Set this for remote/LAN access (e.g., `http://192.168.1.100:8001`) | | `TTS_*` | No | Text-to-Speech settings | -| `SEARCH_PROVIDER` | No | Search provider (options: `perplexity`, `tavily`, `serper`, `jina`, `exa`, `baidu`, default: `perplexity`) | +| `SEARCH_PROVIDER` | No | Search provider (options: `perplexity`, `tavily`, `serper`, `jina`, `exa`, `baidu`, `searxng`, default: `perplexity`) | | `SEARCH_API_KEY` | No | Unified API key for all search providers | +| `SEARCH_BASE_URL` | No | Base URL for self-hosted search providers (e.g., `http://localhost:8888` for SearXNG) | > 💡 **Remote Access**: If accessing from another device (e.g., `192.168.31.66:3782`), add to `.env`: > ```bash diff --git a/assets/roster/forkers.svg b/assets/roster/forkers.svg index 5a23cf1d..d42ce14a 100644 --- a/assets/roster/forkers.svg +++ b/assets/roster/forkers.svg @@ -10,21 +10,21 @@ Forkers - + - + - + - + - + - -and 1,108 others + +and 1,127 others \ No newline at end of file diff --git a/assets/roster/stargazers.svg b/assets/roster/stargazers.svg index 300b5b44..25514294 100644 --- a/assets/roster/stargazers.svg +++ b/assets/roster/stargazers.svg @@ -26,5 +26,5 @@ -and 8,489 others +and 8,672 others \ No newline at end of file diff --git a/src/api/routers/config.py b/src/api/routers/config.py index c77c1345..71e1ecf6 100644 --- a/src/api/routers/config.py +++ b/src/api/routers/config.py @@ -70,11 +70,15 @@ class SearchConfigCreate(ConfigBase): """Search configuration for creation. Uses unified SEARCH_API_KEY environment variable. + SearXNG provider also supports base_url configuration. 
""" api_key: str | Dict[str, str] = Field( ..., description="API key or {'use_env': 'SEARCH_API_KEY'}" ) + base_url: Optional[str | Dict[str, str]] = Field( + None, description="Base URL for self-hosted providers like SearXNG" + ) class ConfigUpdate(BaseModel): diff --git a/src/services/config/unified_config.py b/src/services/config/unified_config.py index f33f2397..5770c639 100644 --- a/src/services/config/unified_config.py +++ b/src/services/config/unified_config.py @@ -49,7 +49,7 @@ class ConfigType(str, Enum): ], ConfigType.EMBEDDING: ["openai", "azure_openai", "ollama", "jina", "cohere", "huggingface"], ConfigType.TTS: ["openai", "azure_openai"], - ConfigType.SEARCH: ["perplexity", "tavily", "exa", "jina", "serper", "baidu"], + ConfigType.SEARCH: ["perplexity", "tavily", "exa", "jina", "serper", "baidu", "searxng"], } # Environment variable mappings for each service type @@ -80,6 +80,7 @@ class ConfigType(str, Enum): ConfigType.SEARCH: { "provider": "SEARCH_PROVIDER", "api_key": "SEARCH_API_KEY", # Unified API key for all providers + "base_url": "SEARCH_BASE_URL", # For self-hosted providers like SearXNG }, } @@ -246,6 +247,7 @@ def _build_stored_default_config( **base_config, "provider": _get_env_value(env_mapping.get("provider")) or "perplexity", "api_key": {"use_env": "SEARCH_API_KEY"}, + "base_url": {"use_env": "SEARCH_BASE_URL"}, } return base_config @@ -327,6 +329,7 @@ def _build_default_config(self, config_type: ConfigType) -> Dict[str, Any]: "is_default": True, "provider": provider, "api_key": "***", + "base_url": _get_env_value(env_mapping.get("base_url")) or "", } return {"id": "default", "name": "Default (from .env)", "is_default": True} @@ -374,6 +377,7 @@ def _get_default_config_resolved(self, config_type: ConfigType) -> Dict[str, Any "id": "default", "provider": provider, "api_key": _get_env_value(env_mapping.get("api_key")) or "", + "base_url": _get_env_value(env_mapping.get("base_url")) or "", } return {"id": "default"} diff --git 
a/src/services/search/__init__.py b/src/services/search/__init__.py index a21a7329..6ddcffad 100644 --- a/src/services/search/__init__.py +++ b/src/services/search/__init__.py @@ -41,7 +41,7 @@ from typing import Any from src.logging import get_logger -from src.services.config import PROJECT_ROOT, load_config_with_main +from src.services.config import PROJECT_ROOT, get_active_search_config, load_config_with_main from .base import SEARCH_API_KEY_ENV, BaseSearchProvider from .consolidation import CONSOLIDATION_TYPES, PROVIDER_TEMPLATES, AnswerConsolidator @@ -153,9 +153,16 @@ def web_search( "provider": "disabled", } - # Determine provider: function arg > env var > config > default + # Get unified config for active search settings + unified_config = get_active_search_config() or {} + + # Determine provider: function arg > unified config > env var > yaml config > default provider_name = ( - provider or os.environ.get("SEARCH_PROVIDER") or config.get("provider") or "perplexity" + provider + or unified_config.get("provider") + or os.environ.get("SEARCH_PROVIDER") + or config.get("provider") + or "perplexity" ).lower() # Determine consolidation from config if not provided @@ -172,8 +179,16 @@ def web_search( provider_kwargs.setdefault("enable_deep_search", baidu_enable_deep_search) provider_kwargs.setdefault("search_recency_filter", baidu_search_recency_filter) + # Pass api_key from unified config if available + if unified_config.get("api_key"): + provider_kwargs.setdefault("api_key", unified_config["api_key"]) + + # Pass base_url from unified config for providers that need it (e.g., SearXNG) + if provider_name == "searxng" and unified_config.get("base_url"): + provider_kwargs.setdefault("base_url", unified_config["base_url"]) + # Get provider instance - search_provider = get_provider(provider_name) + search_provider = get_provider(provider_name, **provider_kwargs) _logger.progress(f"[{search_provider.name}] Searching: {query[:50]}...") diff --git 
a/src/services/search/consolidation.py b/src/services/search/consolidation.py index 13899ad9..7efb2f4c 100644 --- a/src/services/search/consolidation.py +++ b/src/services/search/consolidation.py @@ -136,6 +136,50 @@ {% endfor %} --- *{{ results|length }} academic papers found via Google Scholar*""", + # ------------------------------------------------------------------------- + # SEARXNG TEMPLATE + # ------------------------------------------------------------------------- + "searxng": """{% if answers %} +### Direct Answers +{% for answer in answers %} +{{ answer }} +{% endfor %} + +--- +{% endif %} +{% if infoboxes %} +{% for infobox in infoboxes %} +## {{ infobox.infobox }}{% if infobox.id %} ({{ infobox.id }}){% endif %} + +{{ infobox.content }} +{% if infobox.urls %} +{% for url in infobox.urls[:3] %} +- [{{ url.title }}]({{ url.url }}) +{% endfor %} +{% endif %} + +--- +{% endfor %} +{% endif %} +### Search Results for "{{ query }}" + +{% for result in results[:max_results] %} +**[{{ loop.index }}] {{ result.title }}** +{{ result.snippet }} +{% if result.date %}*{{ result.date }}*{% endif %} +{% if result.attributes.engine %}*via {{ result.attributes.engine }}*{% endif %} +{{ result.url }} + +{% endfor %} +{% if suggestions %} +--- +*Suggestions: {% for s in suggestions[:5] %}{{ s }}{% if not loop.last %}, {% endif %}{% endfor %}* +{% endif %} +{% if corrections %} +*Did you mean: {% for c in corrections[:3] %}{{ c }}{% if not loop.last %}, {% endif %}{% endfor %}* +{% endif %} +--- +*{{ results|length }} results from SearXNG metasearch*""", } @@ -157,6 +201,7 @@ class AnswerConsolidator: "serper": "serper", "jina": "jina", "serper_scholar": "serper_scholar", + "searxng": "searxng", } def __init__( @@ -317,6 +362,15 @@ def _build_provider_context(self, response: WebSearchResponse) -> dict[str, Any] context["links"] = metadata.get("links", {}) context["images"] = metadata.get("images", {}) + # ----------------------------------------------------------------- 
+ # SEARXNG-specific context + # ----------------------------------------------------------------- + elif provider_lower == "searxng": + context["answers"] = metadata.get("answers", []) + context["infoboxes"] = metadata.get("infoboxes", []) + context["suggestions"] = metadata.get("suggestions", []) + context["corrections"] = metadata.get("corrections", []) + return context def _consolidate_with_template(self, response: WebSearchResponse) -> str: @@ -329,13 +383,19 @@ def _consolidate_with_template(self, response: WebSearchResponse) -> str: # Build context with provider-specific fields context = self._build_provider_context(response) - _logger.debug( - f"Context has {len(context.get('results', []))} results, {len(context.get('citations', []))} citations" + _logger.info( + f"Context: {len(context.get('results', []))} results, " + f"{len(context.get('citations', []))} citations, max_results={context.get('max_results')}" ) + if context.get("results"): + first_result = context["results"][0] + _logger.debug( + f"First result: title='{first_result.get('title', '')[:50]}', snippet='{first_result.get('snippet', '')[:100]}'..." + ) try: rendered = template.render(**context) - _logger.debug("Template rendered successfully") + _logger.debug(f"Template rendered ({len(rendered)} chars)") return rendered except Exception as e: _logger.error(f"Template rendering failed: {e}") diff --git a/src/services/search/providers/__init__.py b/src/services/search/providers/__init__.py index bd3a0ab1..4ed42116 100644 --- a/src/services/search/providers/__init__.py +++ b/src/services/search/providers/__init__.py @@ -116,7 +116,7 @@ def get_default_provider(**kwargs) -> BaseSearchProvider: # Auto-import all providers to trigger registration -from . import baidu, exa, jina, perplexity, serper, tavily +from . 
import baidu, exa, jina, perplexity, searxng, serper, tavily __all__ = [ "register_provider",
diff --git a/src/services/search/providers/searxng.py b/src/services/search/providers/searxng.py
new file mode 100644
index 00000000..1257dc94
--- /dev/null
+++ b/src/services/search/providers/searxng.py
@@ -0,0 +1,304 @@
+"""
+SearXNG Metasearch Provider
+
+SearXNG is a free, open-source metasearch engine that aggregates results
+from multiple search engines while protecting user privacy.
+
+Features:
+- Privacy-focused: No API keys required for self-hosted instances
+- Configurable: Choose specific engines and categories
+- Free: No per-query costs
+- Self-hosted: Full control over search sources
+
+Configuration:
+- SEARCH_BASE_URL: Base URL of SearXNG instance (default: http://localhost:8888)
+- SEARXNG_ENGINES: Comma-separated default engines (optional)
+- SEARXNG_CATEGORIES: Comma-separated default categories (optional)
+
+Note: JSON format must be enabled in SearXNG settings.yml:
+    search:
+      formats:
+        - html
+        - json
+"""
+
+from datetime import datetime
+import os
+from typing import Any
+
+import requests
+
+from ..base import BaseSearchProvider
+from ..types import Citation, SearchResult, WebSearchResponse
+from . import register_provider
+
+
+class SearXNGAPIError(Exception):
+    """SearXNG API error"""
+
+    pass
+
+
+@register_provider("searxng")
+class SearXNGProvider(BaseSearchProvider):
+    """SearXNG metasearch engine provider"""
+
+    display_name = "SearXNG"
+    description = "Privacy-focused metasearch engine"
+    requires_api_key = False
+    supports_answer = False
+
+    DEFAULT_BASE_URL = "http://localhost:8888"
+    DEFAULT_ENGINES = "brave,bing,wikipedia,wikidata,wikinews"
+    DEFAULT_CATEGORIES = "general"
+    # Cap on how much of an error response body is echoed into logs/exceptions;
+    # a misconfigured instance may answer with a full HTML page.
+    MAX_ERROR_BODY_CHARS = 500
+
+    def __init__(self, api_key: str | None = None, **kwargs: Any) -> None:
+        """
+        Initialize SearXNG provider.
+
+        Args:
+            api_key: Not used (SearXNG doesn't require API key).
+            **kwargs: Additional configuration options. ``base_url`` selects the
+                instance; it falls back to SEARCH_BASE_URL, then DEFAULT_BASE_URL.
+        """
+        super().__init__(api_key=api_key, **kwargs)
+        self.base_url = (
+            kwargs.get("base_url") or os.environ.get("SEARCH_BASE_URL") or self.DEFAULT_BASE_URL
+        ).rstrip("/")
+
+    def search(
+        self,
+        query: str,
+        categories: str | None = None,
+        engines: str | None = None,
+        language: str | None = None,
+        time_range: str | None = None,
+        page: int = 1,
+        safesearch: int = 1,
+        timeout: int = 30,
+        **kwargs: Any,
+    ) -> WebSearchResponse:
+        """
+        Perform search using SearXNG API.
+
+        Args:
+            query: Search query.
+            categories: Comma-separated categories (e.g., 'general,science').
+            engines: Comma-separated engines (e.g., 'google,duckduckgo').
+            language: Language code (e.g., "en", "es"); omit or use "auto" for defaults.
+            time_range: Time filter ('day', 'month', 'year').
+            page: Page number (default 1).
+            safesearch: Safe search level (0, 1, 2).
+            timeout: Request timeout in seconds.
+            **kwargs: Additional SearXNG parameters ("format" is always forced to "json").
+
+        Returns:
+            WebSearchResponse: Standardized search response.
+
+        Raises:
+            SearXNGAPIError: If the request fails, the instance answers with a
+                non-200 status, or the body is not a JSON object.
+        """
+        effective_engines = engines or os.environ.get("SEARXNG_ENGINES") or self.DEFAULT_ENGINES
+        effective_categories = (
+            categories or os.environ.get("SEARXNG_CATEGORIES") or self.DEFAULT_CATEGORIES
+        )
+        effective_language = None if not language or language == "auto" else language
+        self.logger.debug(
+            f"Request: base_url={self.base_url}, language={effective_language or 'auto'}, "
+            f"categories={effective_categories}, engines={effective_engines}"
+        )
+
+        params: dict[str, Any] = {
+            "q": query,
+            "format": "json",
+            "pageno": page,
+            "safesearch": safesearch,
+        }
+
+        if effective_language:
+            params["language"] = effective_language
+
+        if effective_categories:
+            params["categories"] = effective_categories
+
+        if effective_engines:
+            params["engines"] = effective_engines
+        if time_range:
+            params["time_range"] = time_range
+
+        params.update(kwargs)
+        # The response is parsed as JSON below, so a caller-supplied "format"
+        # must never override it.
+        params["format"] = "json"
+
+        search_endpoint = f"{self.base_url}/search"
+
+        headers = {
+            "Accept": "application/json",
+            "User-Agent": "DeepTutor/1.0 (SearXNG API Client)",
+        }
+        if effective_language:
+            headers["Accept-Language"] = effective_language
+
+        self.logger.debug(f"Endpoint: {search_endpoint}")
+        self.logger.debug(f"Query params: {params}")
+
+        try:
+            response = requests.get(
+                search_endpoint,
+                params=params,
+                headers=headers,
+                timeout=timeout,
+            )
+            self.logger.debug(f"Request URL: {response.url}")
+        except requests.exceptions.RequestException as e:
+            self.logger.error(f"SearXNG request failed: {e}")
+            raise SearXNGAPIError(f"SearXNG request failed: {e}") from e
+
+        if response.status_code == 403:
+            self.logger.error(
+                "SearXNG returned 403 Forbidden. "
+                "JSON format must be enabled in SearXNG settings.yml: "
+                "search.formats: [html, json]"
+            )
+            raise SearXNGAPIError(
+                "SearXNG API returned 403 Forbidden. "
+                "Ensure JSON format is enabled in your SearXNG instance settings.yml: "
+                "search:\n  formats:\n    - html\n    - json"
+            )
+
+        if response.status_code != 200:
+            # Only an excerpt is reported so an HTML error page cannot flood the log.
+            body_excerpt = response.text[: self.MAX_ERROR_BODY_CHARS]
+            self.logger.error(f"SearXNG API error: {response.status_code} - {body_excerpt}")
+            raise SearXNGAPIError(f"SearXNG API error: {response.status_code} - {body_excerpt}")
+
+        try:
+            data = response.json()
+        except ValueError as e:
+            self.logger.error(f"SearXNG returned invalid JSON: {e}")
+            raise SearXNGAPIError(f"SearXNG returned invalid JSON: {e}") from e
+
+        if not isinstance(data, dict):
+            message = f"SearXNG returned unexpected JSON type: {type(data).__name__}"
+            self.logger.error(message)
+            raise SearXNGAPIError(message)
+
+        # "or []" also covers keys that are present but null.
+        raw_results = data.get("results") or []
+        raw_answers = data.get("answers") or []
+
+        self.logger.debug(f"Response status: {response.status_code}")
+        self.logger.debug(f"Results count: {len(raw_results)}")
+        self.logger.debug(f"Answers count: {len(raw_answers)}")
+
+        unresponsive = data.get("unresponsive_engines") or []
+        if unresponsive:
+            self.logger.warning(f"Unresponsive engines: {unresponsive}")
+
+        if not raw_results and unresponsive:
+            # Entries are normally [engine, error] pairs, but tolerate bare
+            # names or other shapes instead of raising while only logging.
+            engine_errors = ", ".join(
+                f"{e[0]}({e[1]})" if isinstance(e, (list, tuple)) and len(e) >= 2 else str(e)
+                for e in unresponsive
+            )
+            self.logger.warning(f"No results - engines failed: {engine_errors}")
+        elif not raw_results:
+            self.logger.debug("No results returned for query")
+
+        citations: list[Citation] = []
+        search_results: list[SearchResult] = []
+
+        for i, result in enumerate(raw_results, 1):
+            # SearXNG can emit explicit nulls (e.g. "publishedDate": null),
+            # so dict.get defaults are not enough; normalise with "or".
+            title = result.get("title") or ""
+            url = result.get("url") or ""
+            snippet = result.get("content") or ""
+            date = result.get("publishedDate") or ""
+            engine = result.get("engine") or ""
+            category = result.get("category") or "web"
+            score = result.get("score") or 0.0
+
+            attributes: dict[str, Any] = {}
+            if result.get("img_src"):
+                attributes["img_src"] = result["img_src"]
+            if engine:
+                attributes["engine"] = engine
+
+            sr = SearchResult(
+                title=title,
+                url=url,
+                snippet=snippet,
+                date=date,
+                source=engine,
+                score=score,
+                attributes=attributes,
+            )
+            search_results.append(sr)
+
+            citations.append(
+                Citation(
+                    id=i,
+                    reference=f"[{i}]",
+                    url=url,
+                    title=title,
+                    snippet=snippet,
+                    date=date,
+                    source=engine,
+                    type=category,
+                )
+            )
+
+        answer_texts = []
+        for ans in raw_answers:
+            if isinstance(ans, str):
+                answer_texts.append(ans)
+            elif isinstance(ans, dict) and ans.get("content"):
+                answer_texts.append(str(ans["content"]))
+
+        answer = "\n\n".join(answer_texts) if answer_texts else ""
+
+        if not answer and search_results:
+            answer = search_results[0].snippet
+
+        metadata: dict[str, Any] = {
+            "finish_reason": "stop",
+            "base_url": self.base_url,
+            "answers": answer_texts,
+            "infoboxes": data.get("infoboxes") or [],
+            "suggestions": data.get("suggestions") or [],
+            "corrections": data.get("corrections") or [],
+        }
+
+        self.logger.info(f"Returned {len(search_results)} results for query: {query[:50]}")
+
+        return WebSearchResponse(
+            query=query,
+            answer=answer,
+            provider="searxng",
+            timestamp=datetime.now().isoformat(),
+            model="searxng",
+            citations=citations,
+            search_results=search_results,
+            usage={},
+            metadata=metadata,
+        )
+
+    def is_available(self) -> bool:
+        """
+        Check if SearXNG instance is reachable.
+
+        Returns:
+            bool: True if instance responds, False otherwise.
+        """
+        try:
+            response = requests.get(f"{self.base_url}/", timeout=5)
+            return response.status_code == 200
+        except requests.exceptions.RequestException:
+            return False
diff --git a/web/app/settings/components/ConfigForm.tsx b/web/app/settings/components/ConfigForm.tsx index de398cd9..ee8c62c4 100644 --- a/web/app/settings/components/ConfigForm.tsx +++ b/web/app/settings/components/ConfigForm.tsx @@ -54,9 +54,15 @@ export default function ConfigForm({ return !!(value && typeof value === "object" && "use_env" in value); }; + // Helper to check if a value is masked (hidden for security) + const isMaskedValue = (value: any): boolean => { + return value === "***"; + }; + // Helper to get display value const getDisplayValue = (value: any): string => { if (isEnvReference(value)) return ""; + if (isMaskedValue(value)) return ""; return typeof value === "string" ?
value : ""; }; @@ -68,6 +74,11 @@ export default function ConfigForm({ ? isEnvReference(editConfig.api_key) : false; + // Track if original API key was masked (stored key that shouldn't be overwritten if empty) + const originalApiKeyWasMasked = editConfig + ? isMaskedValue(editConfig.api_key) + : false; + // Form state const [name, setName] = useState(editConfig?.name || ""); const [provider, setProvider] = useState( @@ -180,18 +191,31 @@ export default function ConfigForm({ const payload: Record = { name, provider, - api_key: isLocalProvider + }; + + // Determine the API key value to send + // In edit mode, if the original key was masked and user didn't enter a new one, + // don't include api_key in payload to preserve the existing stored key + const shouldPreserveExistingApiKey = + isEditMode && originalApiKeyWasMasked && !apiKey && !useEnvApiKey; + + if (!shouldPreserveExistingApiKey) { + payload.api_key = isLocalProvider ? apiKey || "" : useEnvApiKey ? { use_env: getEnvVarForApiKey(configType) } - : apiKey, - }; + : apiKey; + } if (!isSearchConfig) { payload.base_url = useEnvBaseUrl ? { use_env: getEnvVarForBaseUrl(configType) } : baseUrl; payload.model = model; + } else if (isSearchConfig && provider === "searxng") { + payload.base_url = useEnvBaseUrl + ? { use_env: "SEARCH_BASE_URL" } + : baseUrl; } if (showDimensions) { @@ -296,8 +320,8 @@ export default function ConfigForm({ - {/* Base URL (not for search) */} - {!isSearchConfig && ( + {/* Base URL (not for search, except searxng) */} + {(!isSearchConfig || (isSearchConfig && provider === "searxng")) && (