Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,5 @@ report.[0-9]_.[0-9]_.[0-9]_.[0-9]_.json
.turbo


apps/demo
apps/demo
apps/examples/test
12 changes: 8 additions & 4 deletions apps/examples/support-ticket-search/lib/unrag/core/ingest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,13 @@ export const ingest = async (

const chunkingOptions = {
...config.defaults,
...input.chunking
...input.chunking,
sourceId: input.sourceId,
metadata: input.metadata ?? {}
}

const chunker = input.chunker ?? config.chunker

const metadata = input.metadata ?? {}
const documentId = config.idGenerator()
const assets: AssetInput[] = Array.isArray(input.assets) ? input.assets : []
Expand Down Expand Up @@ -126,7 +130,7 @@ export const ingest = async (
const prepared: PreparedChunk[] = []
const warnings: IngestWarning[] = []

const baseTextChunks = config.chunker(input.content, chunkingOptions)
const baseTextChunks = chunker(input.content, chunkingOptions)
for (const c of baseTextChunks) {
prepared.push({
chunk: {
Expand Down Expand Up @@ -215,7 +219,7 @@ export const ingest = async (
.filter((t) => t.content.trim().length > 0)

for (const item of nonEmptyItems) {
const chunks = config.chunker(item.content, chunkingOptions)
const chunks = chunker(item.content, chunkingOptions)
for (const c of chunks) {
outSpecs.push({
documentId,
Expand Down Expand Up @@ -405,7 +409,7 @@ export const ingest = async (
storedTokenCount: storedCaptionTokenCount
})
} else if (caption) {
const captionChunks = config.chunker(caption, chunkingOptions)
const captionChunks = chunker(caption, chunkingOptions)
for (const c of captionChunks) {
specs.push({
documentId,
Expand Down
7 changes: 7 additions & 0 deletions apps/examples/support-ticket-search/lib/unrag/core/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -702,6 +702,13 @@ export type IngestInput = {
sourceId: string
content: string
metadata?: Metadata
/**
* Per-ingest chunker override.
*
* Use this to switch chunking algorithms for a single ingest call without
* changing the engine's configured chunker.
*/
chunker?: Chunker
chunking?: Partial<ChunkingOptions>
/** Optional rich media attached to the document. */
assets?: AssetInput[]
Expand Down
7 changes: 7 additions & 0 deletions apps/web/app/api/_lib/registry-manifest.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ export type RegistryManifest = {
version: number
extractors: Array<{id: string; status?: 'available' | 'coming-soon'}>
connectors: Array<{id: string; status?: 'available' | 'coming-soon'}>
chunkers?: Array<{
id: string
label?: string
description?: string
status?: 'available' | 'coming-soon'
docsPath?: string | null
}>
batteries?: Array<{
id: string
status?: 'available' | 'coming-soon'
Expand Down
109 changes: 107 additions & 2 deletions apps/web/app/api/presets/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ type WizardStateV1 = {
extractors: string[]
connectors: string[]
batteries?: string[]
chunkers?: string[]
}
chunking?: {
method?: string
minChunkSize?: number
model?: string
language?: string
}
defaults: {
chunkSize: number
Expand Down Expand Up @@ -63,8 +70,17 @@ type PresetPayloadV1 = {
extractors: string[]
connectors: string[]
batteries: string[]
chunkers: string[]
}
config: {
chunking?: {
method?: string
options?: {
minChunkSize?: number
model?: string
language?: string
}
}
defaults: {
chunking: {chunkSize: number; chunkOverlap: number}
retrieval: {topK: number}
Expand Down Expand Up @@ -110,6 +126,15 @@ function isWizardStateV1(x: unknown): x is WizardStateV1 {
) {
return false
}
if (
o.modules &&
typeof o.modules === 'object' &&
'chunkers' in o.modules &&
(o.modules as Record<string, unknown>).chunkers != null &&
!Array.isArray((o.modules as Record<string, unknown>).chunkers)
) {
return false
}
return true
}

Expand All @@ -127,6 +152,36 @@ function normalizeWizardState(input: WizardStateV1): WizardStateV1 {
const batteries = Array.isArray(input.modules.batteries)
? input.modules.batteries.map(String).filter(Boolean)
: []
const chunkers = Array.isArray(input.modules.chunkers)
? input.modules.chunkers.map(String).filter(Boolean)
: []

const chunkingMethod = String(input.chunking?.method ?? 'recursive')
.trim()
.toLowerCase()
const chunkingMinChunkSize = Number(input.chunking?.minChunkSize) || 24
const chunkingModelRaw = String(input.chunking?.model ?? '').trim()
const chunkingLanguageRaw = String(input.chunking?.language ?? '').trim()

// These sentinels are used by the /install wizard UI. They should not be
// emitted into presets/config as literal values.
const CHUNKER_MODEL_DEFAULT_VALUE = '__default__'
const AUTO_LANGUAGE_VALUE = '__auto__'

const chunkingModel =
chunkingModelRaw === CHUNKER_MODEL_DEFAULT_VALUE ? '' : chunkingModelRaw
const chunkingLanguage =
chunkingLanguageRaw === AUTO_LANGUAGE_VALUE ? '' : chunkingLanguageRaw

const isBuiltInMethod =
chunkingMethod === 'recursive' ||
chunkingMethod === 'token' ||
chunkingMethod === 'custom'
const ensuredChunkers = isBuiltInMethod
? chunkers
: chunkers.includes(chunkingMethod)
? chunkers
: [...chunkers, chunkingMethod].sort()

const chunkSize = Number(input.defaults.chunkSize) || 200
const chunkOverlap = Number(input.defaults.chunkOverlap) || 40
Expand Down Expand Up @@ -160,7 +215,13 @@ function normalizeWizardState(input: WizardStateV1): WizardStateV1 {
return {
v: 1,
install: {installDir, storeAdapter, aliasBase},
modules: {extractors, connectors, batteries},
modules: {extractors, connectors, batteries, chunkers: ensuredChunkers},
chunking: {
method: chunkingMethod,
minChunkSize: chunkingMinChunkSize,
...(chunkingModel ? {model: chunkingModel} : {}),
...(chunkingLanguage ? {language: chunkingLanguage} : {})
},
defaults: {chunkSize, chunkOverlap, topK},
embedding: {
type: embeddingType,
Expand All @@ -174,6 +235,8 @@ function normalizeWizardState(input: WizardStateV1): WizardStateV1 {

function makePresetFromWizard(state: WizardStateV1): PresetPayloadV1 {
const assetProcessing = state.engine?.assetProcessing
const CHUNKER_MODEL_DEFAULT_VALUE = '__default__'
const AUTO_LANGUAGE_VALUE = '__auto__'
return {
version: 1,
createdAt: new Date().toISOString(),
Expand All @@ -187,9 +250,35 @@ function makePresetFromWizard(state: WizardStateV1): PresetPayloadV1 {
connectors: state.modules.connectors,
batteries: (state.modules.batteries ?? [])
.map(String)
.filter(Boolean)
.filter(Boolean),
chunkers: (state.modules.chunkers ?? []).map(String).filter(Boolean)
},
config: {
...(state.chunking?.method
? {
chunking: {
method: state.chunking.method,
options: {
...(typeof state.chunking.minChunkSize ===
'number'
? {
minChunkSize:
state.chunking.minChunkSize
}
: {}),
...(state.chunking.model &&
state.chunking.model !==
CHUNKER_MODEL_DEFAULT_VALUE
? {model: state.chunking.model}
: {}),
...(state.chunking.language &&
state.chunking.language !== AUTO_LANGUAGE_VALUE
? {language: state.chunking.language}
: {})
}
}
}
: {}),
defaults: {
chunking: {
chunkSize: state.defaults.chunkSize,
Expand Down Expand Up @@ -281,6 +370,11 @@ export async function POST(req: NextRequest) {
.filter((b) => b.status === 'available')
.map((b) => b.id)
)
const allowedChunkers = new Set(
(manifest.chunkers ?? [])
.filter((c) => c.status === 'available')
.map((c) => c.id)
)

const unknownExtractors = state.modules.extractors.filter(
(x) => !allowedExtractors.has(x)
Expand Down Expand Up @@ -313,6 +407,17 @@ export async function POST(req: NextRequest) {
)
}

const chunkerIds = (state.modules.chunkers ?? [])
.map(String)
.filter(Boolean)
const unknownChunkers = chunkerIds.filter((x) => !allowedChunkers.has(x))
if (unknownChunkers.length > 0) {
return NextResponse.json(
{error: 'Unknown or unavailable chunkers', unknownChunkers},
{status: 400}
)
}

const preset = makePresetFromWizard(state)
const id = newPresetId()
const key = `unrag:preset:${id}`
Expand Down
Loading