Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
275 changes: 274 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,284 @@

[![Go Reference](https://pkg.go.dev/badge/github.com/sixt/tensorlake-go.svg)](https://pkg.go.dev/github.com/sixt/tensorlake-go)

A comprehensive Go SDK for the [Tensorlake API](https://docs.tensorlake.ai/api-reference/v2/introduction), enabling intelligent document processing with parsing, structured data extraction, and page classification capabilities.

## Features

- **Document Parsing**: Convert PDFs, DOCX, images, and more to structured markdown
- **Data Extraction**: Extract structured data using JSON schemas
- **Page Classification**: Classify pages by content type
- **File Management**: Upload and manage documents
- **Datasets**: Reusable parsing configurations for consistent processing
- **SSE Support**: Real-time progress updates via Server-Sent Events
- **Iterator Pattern**: Easy pagination through results

## Installation

```bash
go get github.com/sixt/tensorlake-go
```

This repository contains an implementation of the [Tensorlake API Reference](https://docs.tensorlake.ai/api-reference/v2/introduction), enabling document parsing, structured data extraction, and page classification.
**Requirements:** Go 1.25 or later

## Quick Start

### 1. Initialize the Client

```go
import "github.com/sixt/tensorlake-go"

c := tensorlake.NewClient(
tensorlake.WithRegion(tensorlake.RegionOnPrem),
tensorlake.WithBaseURL("https://api.your-domain.com"),
tensorlake.WithAPIKey("your-api-key"),
)
```

### 2. Upload a File

```go
file, _ := os.Open("document.pdf")
defer file.Close()

uploadResp, _ := c.UploadFile(context.Background(), &tensorlake.UploadFileRequest{
FileBytes: file,
FileName: "document.pdf",
Labels: map[string]string{"category": "invoice"},
})

fmt.Printf("File uploaded: %s\n", uploadResp.FileId)
```

### 3. Parse the Document

```go
parseJob, _ := c.ParseDocument(context.Background(), &tensorlake.ParseDocumentRequest{
FileSource: tensorlake.FileSource{
FileId: uploadResp.FileId,
},
})

// Get results with real-time updates
result, _ := c.GetParseResult(
context.Background(),
parseJob.ParseId,
tensorlake.WithSSE(true),
tensorlake.WithOnUpdate(func(name tensorlake.ParseEventName, r *tensorlake.ParseResult) {
fmt.Printf("Status: %s - %d/%d pages\n", name, r.ParsedPagesCount, r.TotalPages)
}),
)

// Access parsed content
for _, page := range result.Pages {
fmt.Printf("Page %d:\n", page.PageNumber)
// Process page content...
}
```

## Documentation

### Core APIs

- **[File Management APIs](./docs/file-apis.md)** - Upload, list, retrieve metadata, and delete files
- **[Parse APIs](./docs/parse-apis.md)** - Parse documents, extract data, and classify pages
- **[Dataset APIs](./docs/dataset-apis.md)** - Create reusable parsing configurations

### Comprehensive Examples

#### Extract Structured Data

```go
import "github.com/google/jsonschema-go/jsonschema"

// Define extraction schema
type InvoiceData struct {
InvoiceNumber string `json:"invoice_number"`
VendorName string `json:"vendor_name"`
TotalAmount float64 `json:"total_amount"`
LineItems []LineItem `json:"line_items"`
}

type LineItem struct {
Description string `json:"description"`
Amount float64 `json:"amount"`
}

schema, _ := jsonschema.For[InvoiceData](nil)

// Parse with extraction
parseJob, _ := c.ParseDocument(context.Background(), &tensorlake.ParseDocumentRequest{
FileSource: tensorlake.FileSource{FileId: fileId},
StructuredExtractionOptions: []tensorlake.StructuredExtractionOptions{
{
SchemaName: "invoice_data",
JSONSchema: schema,
PartitionStrategy: tensorlake.PartitionStrategyNone,
ProvideCitations: true,
},
},
})

// Retrieve and unmarshal extracted data
result, _ := c.GetParseResult(context.Background(), parseJob.ParseId)
for _, data := range result.StructuredData {
var extracted map[string]interface{}
json.Unmarshal(data.Data, &extracted)
fmt.Printf("Extracted: %+v\n", extracted)
}
```

#### Classify Pages

```go
parseJob, err := c.ClassifyDocument(context.Background(), &tensorlake.ClassifyDocumentRequest{
FileSource: tensorlake.FileSource{FileId: fileId},
PageClassifications: []tensorlake.PageClassConfig{
{
Name: "signature_page",
Description: "Pages containing signatures or signature blocks",
},
{
Name: "terms_and_conditions",
Description: "Pages with legal terms and conditions",
},
},
})

result, _ := c.GetParseResult(context.Background(), parseJob.ParseId)
for _, pageClass := range result.PageClasses {
fmt.Printf("Class '%s' found on pages: %v\n", pageClass.PageClass, pageClass.PageNumbers)
}
```

#### Use Datasets for Batch Processing

```go
// Create a reusable dataset
dataset, err := c.CreateDataset(context.Background(), &tensorlake.CreateDatasetRequest{
Name: "invoice-processing",
Description: "Standard invoice parsing configuration",
ParsingOptions: &tensorlake.ParsingOptions{
TableOutputMode: tensorlake.TableOutputModeMarkdown,
},
StructuredExtractionOptions: []tensorlake.StructuredExtractionOptions{
{
SchemaName: "invoice",
JSONSchema: schema,
},
},
})

// Process multiple files with the same configuration
fileIds := []string{"file_001", "file_002", "file_003"}
for _, fileId := range fileIds {
parseJob, err := c.ParseDataset(context.Background(), &tensorlake.ParseDatasetRequest{
DatasetId: dataset.DatasetId,
FileSource: tensorlake.FileSource{FileId: fileId},
})
// Process results...
}
```

## Advanced Features

### Server-Sent Events (SSE)

Get real-time progress updates for long-running parse jobs:

```go
result, err := c.GetParseResult(
ctx,
parseId,
tensorlake.WithSSE(true),
tensorlake.WithOnUpdate(func(name tensorlake.ParseEventName, r *tensorlake.ParseResult) {
	switch name {
	// NOTE(review): the constants below are lowercase (unexported) and would not be
	// accessible to SDK users — confirm the exported event-name constants.
	case tensorlake.sseEventParseQueued:
fmt.Println("Job queued")
case tensorlake.sseEventParseUpdate:
fmt.Printf("Progress: %d/%d pages\n", r.ParsedPagesCount, r.TotalPages)
case tensorlake.sseEventParseDone:
fmt.Println("Complete!")
}
}),
)
```

### Iterator Pattern

Easily iterate through paginated results:

```go
// Iterate all files
for file, err := range c.IterFiles(ctx, 50, tensorlake.PaginationDirectionNext) {
if err != nil {
panic(err)
}
fmt.Printf("File: %s\n", file.FileName)
}

// Iterate all parse jobs
for job, err := range c.IterParseJobs(ctx, 50, tensorlake.PaginationDirectionNext) {
if err != nil {
panic(err)
}
fmt.Printf("Job %s: Status: %s\n", job.ParseId, job.Status)
}

// Iterate all datasets
for dataset, err := range c.IterDatasets(ctx, 50, tensorlake.PaginationDirectionNext) {
if err != nil {
panic(err)
}
fmt.Printf("Dataset %s: Name: %s, Status: %s\n", dataset.DatasetId, dataset.Name, dataset.Status)
}
```

## Supported File Types

- **Documents**: PDF, DOCX
- **Spreadsheets**: XLS, XLSX, XLSM, CSV
- **Presentations**: PPTX, Apple Keynote
- **Images**: PNG, JPG, JPEG
- **Text**: Plain text, HTML

Maximum file size: 1 GB

## Error Handling

All API methods return structured errors:

```go
result, err := c.ParseDocument(ctx, request)
if err != nil {
var apiErr *tensorlake.ErrorResponse
if errors.As(err, &apiErr) {
fmt.Printf("API Error: %s (Code: %s)\n", apiErr.Message, apiErr.ErrorCode)
// Handle specific error codes
} else {
fmt.Printf("Network/Client Error: %v\n", err)
}
}
```

## Best Practices

1. **Reuse Datasets** - Create datasets for frequently processed document types
2. **Use SSE** - Enable SSE for large documents to track progress
3. **Batch Processing** - Process similar documents with the same dataset configuration
4. **Error Handling** - Always check error responses and handle retries appropriately
5. **Labels** - Use labels to organize and filter files and parse jobs
6. **Iterators** - Use iterator methods for efficient pagination through large result sets

## Contributing

Contributions are welcome! Please feel free to submit issues or pull requests.

## Related Resources

- [Tensorlake API Documentation](https://docs.tensorlake.ai/)
- [API Reference](https://docs.tensorlake.ai/api-reference/v2/introduction)
- [Go Package Documentation](https://pkg.go.dev/github.com/sixt/tensorlake-go)

## License

Expand Down
6 changes: 5 additions & 1 deletion dataset_create.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,16 @@ type CreateDatasetResponse struct {
}

// CreateDataset creates a new dataset.
//
// See also: [Create Dataset API Reference]
//
// [Create Dataset API Reference]: https://docs.tensorlake.ai/api-reference/v2/datasets/create
func (c *Client) CreateDataset(ctx context.Context, in *CreateDatasetRequest) (*CreateDatasetResponse, error) {
b, err := json.Marshal(in)
if err != nil {
return nil, fmt.Errorf("failed to marshal request: %w", err)
}

req, err := http.NewRequestWithContext(ctx, http.MethodPost, c.baseURL+"/datasets", bytes.NewReader(b))
if err != nil {
return nil, fmt.Errorf("failed to create request: %w", err)
Expand Down
4 changes: 4 additions & 0 deletions dataset_delete.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ import (
)

// DeleteDataset deletes a dataset from Tensorlake.
//
// See also: [Delete Dataset API Reference]
//
// [Delete Dataset API Reference]: https://docs.tensorlake.ai/api-reference/v2/datasets/delete
func (c *Client) DeleteDataset(ctx context.Context, datasetId string) error {
reqURL := fmt.Sprintf("%s/datasets/%s", c.baseURL, datasetId)

Expand Down
4 changes: 4 additions & 0 deletions dataset_get.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ import (
)

// GetDataset retrieves details for a specific dataset.
//
// See also: [Get Dataset API Reference]
//
// [Get Dataset API Reference]: https://docs.tensorlake.ai/api-reference/v2/datasets/get
func (c *Client) GetDataset(ctx context.Context, datasetId string) (*Dataset, error) {
reqURL := fmt.Sprintf("%s/datasets/%s", c.baseURL, datasetId)

Expand Down
8 changes: 8 additions & 0 deletions dataset_list.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,10 @@ type ListDatasetsRequest struct {
}

// ListDatasets lists all datasets in the organization.
//
// See also: [List Datasets API Reference]
//
// [List Datasets API Reference]: https://docs.tensorlake.ai/api-reference/v2/datasets/list
func (c *Client) ListDatasets(ctx context.Context, in *ListDatasetsRequest) (*PaginationResult[Dataset], error) {
reqURL := c.baseURL + "/datasets"

Expand Down Expand Up @@ -148,6 +152,10 @@ type ListDatasetDataRequest struct {
// ListDatasetData lists all the parse jobs associated with a specific dataset.
// This endpoint allows you to retrieve the status and metadata of each parse job
// that has been submitted under the specified dataset.
//
// See also: [List Dataset Data API Reference]
//
// [List Dataset Data API Reference]: https://docs.tensorlake.ai/api-reference/v2/datasets/data
func (c *Client) ListDatasetData(ctx context.Context, in *ListDatasetDataRequest) (*PaginationResult[ParseResult], error) {
reqURL := fmt.Sprintf("%s/datasets/%s/data", c.baseURL, in.DatasetId)
params := url.Values{}
Expand Down
4 changes: 4 additions & 0 deletions dataset_parse.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ type ParseDatasetRequest struct {
}

// ParseDataset parses a document using a dataset's configuration.
//
// See also: [Parse Dataset API Reference]
//
// [Parse Dataset API Reference]: https://docs.tensorlake.ai/api-reference/v2/datasets/parse
func (c *Client) ParseDataset(ctx context.Context, in *ParseDatasetRequest) (*ParseJob, error) {
if !in.SourceProvided() {
return nil, fmt.Errorf("exactly one of file_id, file_url, or raw_text must be provided")
Expand Down
4 changes: 2 additions & 2 deletions dataset_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,8 +89,8 @@ func TestDataset(t *testing.T) {
t.Logf("dataset parse job: %+v", p)

// Get parse job results.
r, err := c.GetParseResult(t.Context(), p.ParseId, WithSSE(true), WithOnUpdate(func(eventName string, _ *ParseResult) {
t.Logf("parse status: %s", eventName)
r, err := c.GetParseResult(t.Context(), p.ParseId, WithSSE(true), WithOnUpdate(func(name ParseEventName, _ *ParseResult) {
t.Logf("parse status: %s", name)
}))
if err != nil {
t.Fatalf("failed to get parse job result: %v", err)
Expand Down
4 changes: 4 additions & 0 deletions dataset_update.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ type UpdateDatasetRequest struct {
}

// UpdateDataset updates a dataset's settings.
//
// See also: [Update Dataset API Reference]
//
// [Update Dataset API Reference]: https://docs.tensorlake.ai/api-reference/v2/datasets/update
func (c *Client) UpdateDataset(ctx context.Context, in *UpdateDatasetRequest) (*Dataset, error) {
reqBody, err := json.Marshal(in)
if err != nil {
Expand Down
Loading