-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathenv.example
More file actions
197 lines (159 loc) · 6.55 KB
/
env.example
File metadata and controls
197 lines (159 loc) · 6.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# ============================================================================
# Weaver - Environment Configuration
# ============================================================================
#
# Quick Start:
# cp env.example .env
# # Edit the REQUIRED section below, then:
# cd app/backend && docker compose up -d
# make run-backend # Terminal 1
# make run-frontend # Terminal 2
#
# This single .env file is used by BOTH backend and frontend.
# The backend reads it directly; the frontend reads it via next.config.ts.
#
# ============================================================================
# ============================================================================
# REQUIRED — You must set these to run Weaver
# ============================================================================
# --- LLM API Key (pick one) ---
# Option 1: OpenRouter (recommended — one key for many models)
# Get your key at: https://openrouter.ai/keys
OPENROUTER_API_KEY=
# Option 2: OpenAI directly
# Get your key at: https://platform.openai.com/api-keys
# OPENAI_API_KEY=
# Option 3: Anthropic Claude (official API)
# Get your key at: https://console.anthropic.com/settings/keys
# Docs: https://docs.anthropic.com/en/api/getting-started
# ANTHROPIC_API_KEY=
# --- Database ---
# Default works with `docker compose up -d` out of the box.
DATABASE_URL=postgresql://research_rag:research_rag_dev@localhost:5432/research_rag
# --- Redis (task queue) ---
# Default works with `docker compose up -d` out of the box.
REDIS_URL=redis://localhost:6379
# ============================================================================
# RECOMMENDED — Needed for specific features
# ============================================================================
# Google Gemini API (for PDF OCR and audio/video transcription)
# Get your key at: https://aistudio.google.com/apikey
# Free tier available. Required for: scanned PDF processing, audio/video transcription.
GOOGLE_API_KEY=
# Authentication — choose one approach:
#
# Option A: Skip auth entirely (easiest for local dev)
AUTH_BYPASS_ENABLED=true
#
# Option B: Use Supabase auth (production / multi-user)
# AUTH_BYPASS_ENABLED=false
# NEXT_PUBLIC_SUPABASE_URL=https://your-project.supabase.co
# NEXT_PUBLIC_SUPABASE_ANON_KEY=your-anon-key
# SUPABASE_JWT_SECRET=your-jwt-secret
# ============================================================================
# OPTIONAL — Sensible defaults, change only if needed
# ============================================================================
# --- LLM Model Selection ---
LLM_MODEL=google/gemini-2.5-flash
EMBEDDING_MODEL=openai/text-embedding-3-small
# --- Local Deployment (optional — for running without cloud APIs) ---
# Uncomment these to use Ollama or another local LLM server.
# See docs/LOCAL_DEPLOYMENT.md for setup instructions.
#
# LLM_PROVIDER_TYPE=local # openrouter (default) | openai | anthropic | local
# LOCAL_LLM_BASE_URL=http://localhost:11434/v1
# LOCAL_LLM_MODEL=llama3.2:3b
# LOCAL_LLM_API_KEY=ollama
#
# EMBEDDING_PROVIDER_TYPE=local # openrouter (default) | openai | local
# EMBEDDING_DIMENSION=768 # Must match your embedding model (768 for nomic-embed-text)
# LOCAL_EMBEDDING_BASE_URL=http://localhost:11434/v1
# LOCAL_EMBEDDING_MODEL=nomic-embed-text
# --- RAG Configuration ---
# traditional: chunk-based retrieval (default)
# long_context: full document context (NotebookLM style)
# auto: choose based on document size
RAG_MODE=traditional
RETRIEVAL_TOP_K=5
RETRIEVAL_MIN_SIMILARITY=0.0
# Long context mode settings
LONG_CONTEXT_SAFETY_RATIO=0.55
LONG_CONTEXT_MIN_TOKENS=10000
# Citation settings
ENABLE_CITATION_GROUNDING=true
CITATION_FORMAT=both
# Intent classification (disabled by default for faster TTFB)
INTENT_CLASSIFICATION_ENABLED=false
INTENT_CACHE_ENABLED=true
# --- OCR Configuration ---
# auto: unstructured first, fallback to Gemini for scanned PDFs
# unstructured: lightweight parser (no PyTorch)
# gemini: Google Gemini Vision OCR (best quality, requires GOOGLE_API_KEY)
# docling: heavy parser with PyTorch (optional install)
OCR_MODE=auto
GEMINI_OCR_CONCURRENCY=3
# --- Vector Store ---
# pgvector: PostgreSQL extension (default, no extra service needed)
# qdrant: dedicated vector database (better for large-scale)
VECTOR_STORE_PROVIDER=pgvector
# Qdrant settings (only if VECTOR_STORE_PROVIDER=qdrant)
# QDRANT_URL=http://localhost:6333
# QDRANT_API_KEY=
# QDRANT_COLLECTION_NAME=document_chunks
# --- CORS ---
CORS_ORIGINS=http://localhost:3000,http://localhost:3001
# --- General ---
ENVIRONMENT=development
LOG_LEVEL=INFO
# ============================================================================
# VIDEO PLATFORMS — Only if you import from these sources
# ============================================================================
# --- YouTube ---
YOUTUBE_RATE_LIMIT_ENABLED=true
YOUTUBE_MIN_DELAY=2.0
YOUTUBE_MAX_DELAY=60.0
# YOUTUBE_PROXY_URL=           # HTTP/SOCKS proxy to route requests through if your IP gets banned or rate-limited
# YOUTUBE_COOKIES_PATH=        # Path to cookies.txt for restricted videos
# Gemini audio transcription max duration (minutes)
GEMINI_AUDIO_MAX_DURATION_MINUTES=60
# --- Bilibili ---
# Authentication is OPTIONAL. Works without it via web scraping.
# Only needed for VIP/restricted videos. Use a test account!
BILIBILI_RATE_LIMIT_ENABLED=true
BILIBILI_MIN_DELAY=2.0
BILIBILI_MAX_DELAY=60.0
BILIBILI_ENABLE_YTDLP=true
# BILIBILI_SESSDATA=
# BILIBILI_BILI_JCT=
# BILIBILI_BUVID3=
# --- Douyin/TikTok ---
# Get your key at: https://tikhub.io
# TIKHUB_API_KEY=
# TIKHUB_BASE_URL=https://api.tikhub.io
# Video storage path
VIDEO_STORAGE_PATH=data/videos
# ============================================================================
# PRODUCTION / CLOUD — Not needed for local development
# ============================================================================
# --- Supabase (production database + auth + storage) ---
# DATABASE_CLIENT_TYPE=supabase
# SUPABASE_URL=https://your-project.supabase.co
# SUPABASE_SERVICE_ROLE_KEY=your-service-role-key
# STORAGE_BUCKET=documents
# --- Observability ---
# Langfuse (LLM tracing): https://langfuse.com
LANGFUSE_ENABLED=false
# LANGFUSE_PUBLIC_KEY=
# LANGFUSE_SECRET_KEY=
# LANGFUSE_HOST=https://cloud.langfuse.com
# Loki (centralized logging)
LOKI_ENABLED=false
# LOKI_URL=http://localhost:3100/loki/api/v1/push
# --- Encryption ---
# For encrypting sensitive data in settings storage.
# Generate: python -c "from cryptography.fernet import Fernet; print(Fernet.generate_key().decode())"
# ENCRYPTION_KEY=
# --- URL Extraction ---
URL_EXTRACTION_TIMEOUT=60
URL_CONTENT_MAX_LENGTH=50000
DISABLE_SSRF_CHECK=false