Skip to content

Commit aca86b6

Browse files
committed
Improve test coverage and schema API
1 parent 422b47d commit aca86b6

File tree

17 files changed

+466
-117
lines changed

README.md

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,33 @@ Vectorize Iris is a model-based extraction solution that transforms how RAG syst
88

99
Documentation: [docs.vectorize.io](https://docs.vectorize.io/build-deploy/extract-information/extraction-tester/#vectorize-iris)
1010

11+
## Table of Contents
12+
13+
- [Why Iris?](#why-iris)
14+
- [Quick Start](#quick-start)
15+
- [Installation](#installation)
16+
- [Features](#features)
17+
- [Basic Text Extraction](#basic-text-extraction)
18+
- [Smart Chunking](#smart-chunking)
19+
- [Metadata Extraction](#metadata-extraction)
20+
- [Parsing Instructions](#parsing-instructions)
21+
- [CLI Examples](#cli-examples)
22+
- [Basic Extraction](#basic-extraction)
23+
- [Extract from URL](#extract-from-url)
24+
- [JSON Output](#json-output-for-piping)
25+
- [Plain Text Output](#plain-text-output)
26+
- [Save to File](#save-to-file)
27+
- [Process Directory](#process-directory)
28+
- [Chunking for RAG](#chunking-for-rag)
29+
- [Custom Parsing Instructions](#custom-parsing-instructions)
30+
- [Document Classification](#document-classification)
31+
- [Advanced Options](#advanced-options)
32+
- [Configuration](#configuration)
33+
- [CLI Configuration](#cli-configuration)
34+
- [Python & Node.js Configuration](#python--nodejs-configuration)
35+
- [Documentation](#documentation)
36+
- [License](#license)
37+
- [Support](#support)
1138

1239
## Why Iris?
1340

@@ -334,7 +361,7 @@ vectorize-iris configure --manual
334361
```
335362

336363
You'll be asked to enter:
337-
- API Token
364+
- Access Token
338365
- Organization ID
339366

340367
Get these from [platform.vectorize.io](https://platform.vectorize.io) → Account → Org Settings → Access Tokens

examples/classification.sh

Lines changed: 0 additions & 73 deletions
This file was deleted.

nodejs-api/jest.config.js

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,4 @@ module.exports = {
1616
}
1717
}]
1818
},
19-
transformIgnorePatterns: [
20-
'node_modules/(?!(node-fetch)/)',
21-
],
22-
extensionsToTreatAsEsm: ['.ts'],
23-
globals: {
24-
'ts-jest': {
25-
useESM: true,
26-
},
27-
},
2819
};

nodejs-api/package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,13 @@
2525
"devDependencies": {
2626
"@types/jest": "^29.5.0",
2727
"@types/node": "^20.0.0",
28+
"@types/node-fetch": "^2.6.0",
2829
"jest": "^29.7.0",
2930
"ts-jest": "^29.1.0",
3031
"typescript": "^5.0.0"
3132
},
3233
"dependencies": {
33-
"node-fetch": "^3.3.2"
34+
"node-fetch": "^2.7.0"
3435
},
3536
"files": [
3637
"dist"

nodejs-api/src/index.ts

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,8 +99,13 @@ async function _extractFromBuffer(
9999
// Add metadata (default inferSchema to true)
100100
const inferSchema = options.inferMetadataSchema !== undefined ? options.inferMetadataSchema : true;
101101
if (options.metadataSchemas || inferSchema) {
102+
// Convert schema objects to JSON strings if needed
103+
const normalizedSchemas = options.metadataSchemas?.map(s => ({
104+
id: s.id,
105+
schema: typeof s.schema === 'string' ? s.schema : JSON.stringify(s.schema)
106+
}));
102107
extractionRequest.metadata = {
103-
schemas: options.metadataSchemas,
108+
schemas: normalizedSchemas,
104109
inferSchema: inferSchema
105110
};
106111
}

nodejs-api/src/types.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,8 @@ export interface StartFileUploadRequest {
1111

1212
export interface MetadataExtractionStrategySchema {
1313
id: string;
14-
schema: string;
14+
/** Schema definition - can be a JSON string or an object (will be converted to JSON string) */
15+
schema: string | Record<string, unknown>;
1516
}
1617

1718
export interface MetadataExtractionStrategy {
@@ -65,7 +66,7 @@ export interface ExtractionResult {
6566
// Options for extract functions
6667

6768
export interface ExtractionOptions {
68-
/** Vectorize API token (defaults to VECTORIZE_TOKEN env var) */
69+
/** Vectorize access token (defaults to VECTORIZE_TOKEN env var) */
6970
apiToken?: string;
7071
/** Organization ID (defaults to VECTORIZE_ORG_ID env var) */
7172
orgId?: string;

nodejs-api/tests/integration.test.ts

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,6 @@ describeIfCredentials('Integration Tests', () => {
2727

2828
it('should extract with chunking', async () => {
2929
const options: ExtractionOptions = {
30-
chunkingStrategy: 'markdown',
3130
chunkSize: 512
3231
};
3332

@@ -79,7 +78,6 @@ describeIfCredentials('Integration Tests', () => {
7978

8079
it('should extract with all options', async () => {
8180
const options: ExtractionOptions = {
82-
chunkingStrategy: 'markdown',
8381
chunkSize: 256,
8482
metadataSchemas: [
8583
{
@@ -121,7 +119,6 @@ describeIfCredentials('Integration Tests', () => {
121119
const fileName = path.basename(TEST_FILE);
122120

123121
const options: ExtractionOptions = {
124-
chunkingStrategy: 'markdown',
125122
chunkSize: 512
126123
};
127124

nodejs-api/tests/types.test.ts

Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
/**
2+
* Unit tests for type handling and schema conversion
3+
*/
4+
5+
import type { MetadataExtractionStrategySchema, ExtractionOptions } from '../src/types';
6+
7+
describe('MetadataExtractionStrategySchema', () => {
8+
it('should accept schema as a string', () => {
9+
const schema: MetadataExtractionStrategySchema = {
10+
id: 'test',
11+
schema: '{"invoice_number": "string", "total": "number"}'
12+
};
13+
expect(schema.id).toBe('test');
14+
expect(schema.schema).toBe('{"invoice_number": "string", "total": "number"}');
15+
});
16+
17+
it('should accept schema as an object', () => {
18+
const schema: MetadataExtractionStrategySchema = {
19+
id: 'test',
20+
schema: { invoice_number: 'string', total: 'number' }
21+
};
22+
expect(schema.id).toBe('test');
23+
expect(schema.schema).toEqual({ invoice_number: 'string', total: 'number' });
24+
});
25+
26+
it('should accept nested object schemas', () => {
27+
const schema: MetadataExtractionStrategySchema = {
28+
id: 'invoice-data',
29+
schema: {
30+
invoice_number: 'string',
31+
date: 'string',
32+
total_amount: 'number',
33+
vendor_name: 'string',
34+
items: [{
35+
description: 'string',
36+
quantity: 'number',
37+
price: 'number'
38+
}]
39+
}
40+
};
41+
expect(schema.id).toBe('invoice-data');
42+
expect(typeof schema.schema).toBe('object');
43+
});
44+
});
45+
46+
describe('ExtractionOptions with metadataSchemas', () => {
47+
it('should accept metadataSchemas with string schema', () => {
48+
const options: ExtractionOptions = {
49+
metadataSchemas: [{
50+
id: 'doc-info',
51+
schema: 'Extract title, author, and main topics'
52+
}]
53+
};
54+
expect(options.metadataSchemas).toHaveLength(1);
55+
expect(options.metadataSchemas![0].schema).toBe('Extract title, author, and main topics');
56+
});
57+
58+
it('should accept metadataSchemas with object schema', () => {
59+
const options: ExtractionOptions = {
60+
metadataSchemas: [{
61+
id: 'invoice-data',
62+
schema: {
63+
invoice_number: 'string',
64+
date: 'string',
65+
total_amount: 'number',
66+
vendor_name: 'string',
67+
items: [{
68+
description: 'string',
69+
quantity: 'number',
70+
price: 'number'
71+
}]
72+
}
73+
}]
74+
};
75+
expect(options.metadataSchemas).toHaveLength(1);
76+
expect(options.metadataSchemas![0].id).toBe('invoice-data');
77+
expect(typeof options.metadataSchemas![0].schema).toBe('object');
78+
});
79+
80+
it('should accept mixed string and object schemas', () => {
81+
const options: ExtractionOptions = {
82+
metadataSchemas: [
83+
{
84+
id: 'string-schema',
85+
schema: 'Extract basic info'
86+
},
87+
{
88+
id: 'object-schema',
89+
schema: { field: 'string' }
90+
}
91+
]
92+
};
93+
expect(options.metadataSchemas).toHaveLength(2);
94+
expect(typeof options.metadataSchemas![0].schema).toBe('string');
95+
expect(typeof options.metadataSchemas![1].schema).toBe('object');
96+
});
97+
});
98+
99+
describe('Schema conversion to JSON string', () => {
100+
it('should convert object schema to JSON string for API request', () => {
101+
const schemas: MetadataExtractionStrategySchema[] = [
102+
{
103+
id: 'test',
104+
schema: { invoice_number: 'string', total: 'number' }
105+
}
106+
];
107+
108+
// Simulate the conversion that happens in index.ts
109+
const normalizedSchemas = schemas.map(s => ({
110+
id: s.id,
111+
schema: typeof s.schema === 'string' ? s.schema : JSON.stringify(s.schema)
112+
}));
113+
114+
expect(normalizedSchemas[0].schema).toBe('{"invoice_number":"string","total":"number"}');
115+
});
116+
117+
it('should keep string schema as is', () => {
118+
const schemas: MetadataExtractionStrategySchema[] = [
119+
{
120+
id: 'test',
121+
schema: '{"invoice_number": "string"}'
122+
}
123+
];
124+
125+
// Simulate the conversion that happens in index.ts
126+
const normalizedSchemas = schemas.map(s => ({
127+
id: s.id,
128+
schema: typeof s.schema === 'string' ? s.schema : JSON.stringify(s.schema)
129+
}));
130+
131+
expect(normalizedSchemas[0].schema).toBe('{"invoice_number": "string"}');
132+
});
133+
134+
it('should handle nested object schemas', () => {
135+
const schemas: MetadataExtractionStrategySchema[] = [
136+
{
137+
id: 'invoice-data',
138+
schema: {
139+
invoice_number: 'string',
140+
items: [{
141+
description: 'string',
142+
quantity: 'number'
143+
}]
144+
}
145+
}
146+
];
147+
148+
// Simulate the conversion that happens in index.ts
149+
const normalizedSchemas = schemas.map(s => ({
150+
id: s.id,
151+
schema: typeof s.schema === 'string' ? s.schema : JSON.stringify(s.schema)
152+
}));
153+
154+
const parsed = JSON.parse(normalizedSchemas[0].schema);
155+
expect(parsed.invoice_number).toBe('string');
156+
expect(parsed.items).toHaveLength(1);
157+
expect(parsed.items[0].description).toBe('string');
158+
});
159+
});

python-api/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ dev-dependencies = [
3232
"pytest>=7.0.0",
3333
"pytest-cov>=4.0.0",
3434
"pytest-asyncio>=0.21.0",
35+
"pytest-xdist>=3.0.0",
3536
]
3637

3738
[tool.pytest.ini_options]

0 commit comments

Comments
 (0)