From 78259f26c5ac9fe4b2f4221b1decc9935ecea890 Mon Sep 17 00:00:00 2001 From: Vaibhav Raj Date: Tue, 9 Sep 2025 02:13:05 +0530 Subject: [PATCH 1/4] refactor: add convertBuffer to MarkItDown --- src/converters/bingserp.ts | 7 ++- src/converters/docx.ts | 25 ++++---- src/converters/html.ts | 16 +++-- src/converters/image.ts | 63 ++++++++++++-------- src/converters/ipynb.ts | 8 ++- src/converters/media.ts | 6 +- src/converters/mp3.ts | 74 ++++++++++++++--------- src/converters/pdf.ts | 4 +- src/converters/plain-text.ts | 9 ++- src/converters/wav.ts | 68 ++++++++++++--------- src/converters/wikipedia.ts | 17 ++++-- src/converters/xlsx.ts | 15 +++-- src/converters/xml-rss-atom.ts | 7 ++- src/converters/youtube.ts | 7 ++- src/converters/zip.ts | 105 +++++++++++++++++---------------- src/markitdown.ts | 63 ++++++++++---------- src/types.ts | 2 +- 17 files changed, 290 insertions(+), 206 deletions(-) diff --git a/src/converters/bingserp.ts b/src/converters/bingserp.ts index 720f730..018e162 100644 --- a/src/converters/bingserp.ts +++ b/src/converters/bingserp.ts @@ -6,7 +6,7 @@ import { CustomTurnDown } from "../custom-turndown"; export class BingSerpConverter implements DocumentConverter { async convert( - localPath: string, + source: string | Buffer, options: ConverterOptions = {} ): Promise { const fileExtension = options.file_extension || ""; @@ -19,7 +19,10 @@ export class BingSerpConverter implements DocumentConverter { } try { - const htmlContent = fs.readFileSync(localPath, { encoding: "utf-8" }); + const htmlContent = + typeof source === "string" + ? 
fs.readFileSync(source, { encoding: "utf-8" }) + : Buffer.from(source).toString("utf-8"); return this._convert(htmlContent, url); } catch (error) { console.error("Bing SERP Parsing Error:", error); diff --git a/src/converters/docx.ts b/src/converters/docx.ts index 6e16694..9c47ce4 100644 --- a/src/converters/docx.ts +++ b/src/converters/docx.ts @@ -4,25 +4,26 @@ import { HtmlConverter } from "./html"; import Mammoth from "mammoth"; export class DocxConverter extends HtmlConverter { - async convert(local_path: string, options: ConverterOptions): Promise { + async convert(source: string | Buffer, options: ConverterOptions): Promise { const fileExtension = options.file_extension || ""; if (![".docx"].includes(fileExtension.toLowerCase())) { return null; } try { - let exists = fs.existsSync(local_path); - if (!exists) { - throw new Error("File does'nt exists"); - } - let htmlContent = await Mammoth.convertToHtml( - { - path: local_path - }, - { - ...options + let mammothInput: { path: string } | { buffer: Buffer }; + if (typeof source === "string") { + if (!fs.existsSync(source)) { + throw new Error("File does'nt exists"); } - ); + mammothInput = { path: source }; + } else { + mammothInput = { buffer: Buffer.from(source) }; + } + + let htmlContent = await Mammoth.convertToHtml(mammothInput, { + ...options + }); return await this._convert(htmlContent.value); } catch (e) { diff --git a/src/converters/html.ts b/src/converters/html.ts index fd5c0ca..192507b 100644 --- a/src/converters/html.ts +++ b/src/converters/html.ts @@ -4,18 +4,24 @@ import { CustomTurnDown } from "../custom-turndown"; import { ConverterOptions, ConverterResult, DocumentConverter } from "../types"; export class HtmlConverter implements DocumentConverter { - async convert(local_path: string, options: ConverterOptions): Promise { + async convert(source: string | Buffer, options: ConverterOptions): Promise { const extension = options.file_extension || ""; if (![".html", 
".htm"].includes(extension.toLowerCase())) { return null; } try { - let exists = fs.existsSync(local_path); - if (!exists) { - throw new Error("File does'nt exists"); + let content; + if (typeof source === "string") { + let exists = fs.existsSync(source); + if (!exists) { + throw new Error("File does'nt exists"); + } + content = fs.readFileSync(source, { encoding: "utf-8" }); + } else { + content = source.toString("utf-8"); } - let content = fs.readFileSync(local_path, { encoding: "utf-8" }); + return await this._convert(content); } catch (e) { console.error(e); diff --git a/src/converters/image.ts b/src/converters/image.ts index 4bb8069..292d785 100644 --- a/src/converters/image.ts +++ b/src/converters/image.ts @@ -5,7 +5,7 @@ import { generateText } from "ai"; export class ImageConverter extends MediaConverter { async convert( - localPath: string, + source: string | Buffer, options: ConverterOptions = {} ): Promise { const fileExtension = options.file_extension || ""; @@ -14,49 +14,62 @@ export class ImageConverter extends MediaConverter { } try { - return this._convert(localPath, options); + return this._convert(source, options); } catch (error) { console.error("Image Conversion Error:", error); return null; } } - private async _convert(localPath: string, options: ConverterOptions): Promise { + private async _convert( + source: string | Buffer, + options: ConverterOptions + ): Promise { let mdContent = ""; - const metadata = await this._getMetadata(localPath); - if (metadata) { - for (const f of [ - "ImageSize", - "Title", - "Caption", - "Description", - "Keywords", - "Artist", - "Author", - "DateTimeOriginal", - "CreateDate", - "GPSPosition" - ]) { - if (metadata[f]) { - mdContent += `${f}: ${metadata[f]}\n`; + if (typeof source === "string") { + const metadata = await this._getMetadata(source); + if (metadata) { + for (const f of [ + "ImageSize", + "Title", + "Caption", + "Description", + "Keywords", + "Artist", + "Author", + "DateTimeOriginal", + 
"CreateDate", + "GPSPosition" + ]) { + if (metadata[f]) { + mdContent += `${f}: ${metadata[f]}\n`; + } } } + } else { + console.warn( + "Metadata extraction is skipped for Buffer inputs as it requires a file path for exiftool." + ); } + if (options.llmModel) { - mdContent += `\n# Description:\n${( - await this._getLLMDescription(localPath, options) - ).trim()}\n`; + const imageBuffer = + typeof source === "string" ? fs.readFileSync(source) : Buffer.from(source); + mdContent += `\n# Description:\n${(await this._getLLMDescription(imageBuffer, options)).trim()}\n`; } return { title: null, text_content: mdContent.trim() }; } - private async _getLLMDescription(localPath: string, options: ConverterOptions): Promise { + private async _getLLMDescription( + imageBuffer: Buffer, + options: ConverterOptions + ): Promise { if (!options.llmPrompt || options.llmPrompt.trim() === "") { options.llmPrompt = "Write a detailed caption for this image."; } - const imageFile = fs.readFileSync(localPath).toString("base64"); + const imageFileAsBase64 = imageBuffer.toString("base64"); const result = await generateText({ model: options.llmModel!, @@ -67,7 +80,7 @@ export class ImageConverter extends MediaConverter { { type: "text", text: options.llmPrompt }, { type: "image", - image: imageFile + image: imageFileAsBase64 } ] } diff --git a/src/converters/ipynb.ts b/src/converters/ipynb.ts index b5cb976..03d2290 100644 --- a/src/converters/ipynb.ts +++ b/src/converters/ipynb.ts @@ -3,7 +3,7 @@ import * as fs from "fs"; export class IpynbConverter implements DocumentConverter { async convert( - localPath: string, + source: string | Buffer, options: ConverterOptions = {} ): Promise { const fileExtension = options.file_extension || ""; @@ -11,7 +11,11 @@ export class IpynbConverter implements DocumentConverter { return null; } try { - const notebookContent = JSON.parse(fs.readFileSync(localPath, { encoding: "utf-8" })); + const contentStirng = + typeof source === "string" + ? 
fs.readFileSync(source, { encoding: "utf-8" }) + : source.toString("utf-8"); + const notebookContent = JSON.parse(contentStirng); return this._convert(notebookContent); } catch (error) { console.error("Error converting .ipynb file:", error); diff --git a/src/converters/media.ts b/src/converters/media.ts index d72ef43..57ff01c 100644 --- a/src/converters/media.ts +++ b/src/converters/media.ts @@ -5,16 +5,16 @@ import * as util from "util"; const exec = util.promisify(childProcess.exec); export abstract class MediaConverter implements DocumentConverter { - abstract convert(localPath: string, options: ConverterOptions): Promise; + abstract convert(source: string | Buffer, options: ConverterOptions): Promise; - async _getMetadata(localPath: string): Promise<{ [key: string]: string } | null> { + async _getMetadata(local_path: string): Promise<{ [key: string]: string } | null> { const exiftool = await this._which("exiftool"); if (!exiftool) { console.error("exiftool is not found on this system so metadata cannot be extracted"); return null; } try { - const result = await exec(`"${exiftool}" -json "${localPath}"`); + const result = await exec(`"${exiftool}" -json "${local_path}"`); return JSON.parse(result.stdout)[0]; } catch (error) { console.error("Exiftool error:", error); diff --git a/src/converters/mp3.ts b/src/converters/mp3.ts index 621180d..9715d98 100644 --- a/src/converters/mp3.ts +++ b/src/converters/mp3.ts @@ -6,7 +6,7 @@ import * as path from "path"; export class Mp3Converter extends WavConverter { async convert( - localPath: string, + source: string | Buffer, options: ConverterOptions = {} ): Promise { const fileExtension = options.file_extension || ""; @@ -14,46 +14,62 @@ export class Mp3Converter extends WavConverter { return null; } try { - return await this._convert$(localPath, options); + return await this._convert$(source, options); } catch (error) { console.error("MP3 Conversion Error:", error); return null; } } - private async _convert$(localPath: 
string, options: ConverterOptions): Promise { + private async _convert$( + source: string | Buffer, + options: ConverterOptions + ): Promise { let mdContent = ""; - const metadata = await this._getMetadata(localPath); - if (metadata) { - for (const f of [ - "Title", - "Artist", - "Author", - "Band", - "Album", - "Genre", - "Track", - "DateTimeOriginal", - "CreateDate", - "Duration" - ]) { - if (metadata[f]) { - mdContent += `${f}: ${metadata[f]}\n`; + + if (typeof source === "string") { + const metadata = await this._getMetadata(source); + if (metadata) { + for (const f of [ + "Title", + "Artist", + "Author", + "Band", + "Album", + "Genre", + "Track", + "DateTimeOriginal", + "CreateDate", + "Duration" + ]) { + if (metadata[f]) { + mdContent += `${f}: ${metadata[f]}\n`; + } } } + } else { + console.warn( + "Metadata extraction is skipped for Buffer inputs as it requires a file path for exiftool." + ); } - const tempPath = await fs.mkdtemp(path.join(os.tmpdir(), "temp_")); - const wavPath = path.join(tempPath, "audio.wav"); - try { - const transcript = await super._transcribeAudio(wavPath); - mdContent += `\n\n### Audio Transcript:\n${transcript == "" ? "[No speech detected]" : transcript}`; - } catch (e) { - mdContent += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."; - } finally { - await fs.unlink(wavPath); - await fs.rmdir(tempPath); + if (typeof source === "string") { + const tempPath = await fs.mkdtemp(path.join(os.tmpdir(), "temp_")); + const wavPath = path.join(tempPath, "audio.wav"); + try { + const transcript = await super._transcribeAudio(wavPath); + mdContent += `\n\n### Audio Transcript:\n${transcript == "" ? "[No speech detected]" : transcript}`; + } catch (e) { + mdContent += "\n\n### Audio Transcript:\nError. 
Could not transcribe this audio."; + } finally { + await fs.unlink(wavPath); + await fs.rmdir(tempPath); + } + } else { + mdContent += + "\n\n### Audio Transcript:\n[Audio conversion and transcription are not supported for Buffer inputs.]"; } + return { title: null, text_content: mdContent.trim() diff --git a/src/converters/pdf.ts b/src/converters/pdf.ts index 4c7b416..11e174c 100644 --- a/src/converters/pdf.ts +++ b/src/converters/pdf.ts @@ -4,7 +4,7 @@ import { pdfToText } from "pdf-ts"; export class PdfConverter implements DocumentConverter { async convert( - localPath: string, + source: string | Buffer, options: ConverterOptions = {} ): Promise { const fileExtension = options.file_extension || ""; @@ -13,7 +13,7 @@ export class PdfConverter implements DocumentConverter { } try { - const pdfContent = fs.readFileSync(localPath); + const pdfContent = typeof source === "string" ? fs.readFileSync(source) : Buffer.from(source); return this._convert(pdfContent); } catch (error) { console.error("PDF Parsing Error:", error); diff --git a/src/converters/plain-text.ts b/src/converters/plain-text.ts index 350c11c..ffaf346 100644 --- a/src/converters/plain-text.ts +++ b/src/converters/plain-text.ts @@ -4,7 +4,7 @@ import * as mime from "mime-types"; import fs from "fs"; export class PlainTextConverter implements DocumentConverter { - async convert(local_path: string, options: ConverterOptions = {}): Promise { + async convert(source: string | Buffer, options: ConverterOptions = {}): Promise { const fileExtension = options.file_extension || ""; const contentType = mime.lookup(fileExtension); @@ -14,7 +14,12 @@ export class PlainTextConverter implements DocumentConverter { return null; } - const content = fs.readFileSync(local_path, { encoding: "utf-8" }); + let content: string; + if (typeof source === "string") { + content = fs.readFileSync(source, { encoding: "utf-8" }); + } else { + content = Buffer.from(source).toString("utf-8"); + } return { title: null, diff --git 
a/src/converters/wav.ts b/src/converters/wav.ts index 2e9a833..af59740 100644 --- a/src/converters/wav.ts +++ b/src/converters/wav.ts @@ -3,7 +3,7 @@ import { MediaConverter } from "./media"; export class WavConverter extends MediaConverter { async convert( - localPath: string, + source: string | Buffer, options: ConverterOptions = {} ): Promise { const fileExtension = options.file_extension || ""; @@ -11,43 +11,57 @@ export class WavConverter extends MediaConverter { return null; } try { - return this._convert(localPath, options); + return this._convert(source, options); } catch (error) { console.error("WAV Conversion Error:", error); return null; } } - private async _convert(localPath: string, _: ConverterOptions): Promise { + + private async _convert(source: string | Buffer, _: ConverterOptions): Promise { let mdContent = ""; - const metadata = await this._getMetadata(localPath); - if (metadata) { - for (const f of [ - "Title", - "Artist", - "Author", - "Band", - "Album", - "Genre", - "Track", - "DateTimeOriginal", - "CreateDate", - "Duration" - ]) { - if (metadata[f]) { - mdContent += `${f}: ${metadata[f]}\n`; + + if (typeof source === "string") { + const metadata = await this._getMetadata(source); + if (metadata) { + for (const f of [ + "Title", + "Artist", + "Author", + "Band", + "Album", + "Genre", + "Track", + "DateTimeOriginal", + "CreateDate", + "Duration" + ]) { + if (metadata[f]) { + mdContent += `${f}: ${metadata[f]}\n`; + } } } + } else { + console.warn( + "Metadata extraction is skipped for Buffer inputs as it requires a file path for exiftool." + ); } - try { - const transcript = await this._transcribeAudio(localPath); - mdContent += `\n\n### Audio Transcript:\n${ - transcript === "" ? "[No speech detected]" : transcript - }`; - } catch (error) { - console.error("Error loading speech recognition module:", error); - mdContent += "\n\n### Audio Transcript:\nError. 
Could not transcribe this audio."; + if (typeof source === "string") { + try { + const transcript = await this._transcribeAudio(source); + mdContent += `\n\n### Audio Transcript:\n${ + transcript === "" ? "[No speech detected]" : transcript + }`; + } catch (error) { + console.error("Error loading speech recognition module:", error); + mdContent += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."; + } + } else { + mdContent += + "\n\n### Audio Transcript:\n[Audio transcription is not supported for Buffer inputs in this version.]"; } + return { title: null, text_content: mdContent.trim() diff --git a/src/converters/wikipedia.ts b/src/converters/wikipedia.ts index d9896f8..998bdb3 100644 --- a/src/converters/wikipedia.ts +++ b/src/converters/wikipedia.ts @@ -3,19 +3,26 @@ import { JSDOM } from "jsdom"; import { CustomTurnDown } from "../custom-turndown"; import { ConverterOptions, ConverterResult, DocumentConverter } from "../types"; +const WIKIPEDIA_REGEX = /^https?:\/\/[a-zA-Z]{2,3}\.wikipedia\.org\//; +const BODY_SELECTOR_QUERY = "div#mw-content-text"; +const TITLE_SELECTOR_QUERY = "span.mw-page-title-main"; + export class WikipediaConverter implements DocumentConverter { - async convert(localPath: string, options: ConverterOptions = {}): Promise { + async convert(source: string | Buffer, options: ConverterOptions = {}): Promise { const fileExtension = options.file_extension || ""; if (![".html", ".htm"].includes(fileExtension.toLowerCase())) { return null; } const url = options.url || ""; - if (!/^https?:\/\/[a-zA-Z]{2,3}\.wikipedia\.org\//.test(url)) { + if (!WIKIPEDIA_REGEX.test(url)) { return null; } try { - const htmlContent = fs.readFileSync(localPath, { encoding: "utf-8" }); + const htmlContent = + typeof source === "string" + ? 
fs.readFileSync(source, { encoding: "utf-8" }) + : source.toString("utf-8"); return this._convert(htmlContent); } catch (error) { console.error("Wikipedia Parsing Error:", error); @@ -31,8 +38,8 @@ export class WikipediaConverter implements DocumentConverter { script.remove(); }); - const bodyElm = doc.querySelector("div#mw-content-text"); - const titleElm = doc.querySelector("span.mw-page-title-main"); + const bodyElm = doc.querySelector(BODY_SELECTOR_QUERY); + const titleElm = doc.querySelector(TITLE_SELECTOR_QUERY); let webpageText = ""; let mainTitle = doc.title; diff --git a/src/converters/xlsx.ts b/src/converters/xlsx.ts index a3eeeae..3636dce 100644 --- a/src/converters/xlsx.ts +++ b/src/converters/xlsx.ts @@ -4,18 +4,23 @@ import * as fs from "fs"; import * as XLSX from "xlsx"; export class XlsxConverter extends HtmlConverter { - async convert(local_path: string, options: ConverterOptions): Promise { + async convert(source: string | Buffer, options: ConverterOptions): Promise { const extension = options.file_extension || ""; if (![".xlsx"].includes(extension.toLowerCase())) { return null; } try { - let exists = fs.existsSync(local_path); - if (!exists) { - throw new Error("File does'nt exists"); + let workbook: XLSX.WorkBook; + if (typeof source === "string") { + if (!fs.existsSync(source)) { + throw new Error("File does'nt exists"); + } + workbook = XLSX.readFile(source); + } else { + workbook = XLSX.read(source, { type: "buffer" }); } - let workbook = XLSX.readFile(local_path); + let mdContent = ""; for (const sheetName of workbook.SheetNames) { diff --git a/src/converters/xml-rss-atom.ts b/src/converters/xml-rss-atom.ts index 4f2a91d..9adfca2 100644 --- a/src/converters/xml-rss-atom.ts +++ b/src/converters/xml-rss-atom.ts @@ -5,14 +5,17 @@ import * as fs from "fs"; import { JSDOM } from "jsdom"; export class RSSConverter implements DocumentConverter { - async convert(localPath: string, options: ConverterOptions = {}): Promise { + async convert(source: 
string | Buffer, options: ConverterOptions = {}): Promise { const fileExtension = options.file_extension || ""; if (![".xml", ".rss", ".atom"].includes(fileExtension.toLowerCase())) { return null; } try { - const xmlString = fs.readFileSync(localPath, { encoding: "utf-8" }); + const xmlString = + typeof source === "string" + ? fs.readFileSync(source, { encoding: "utf-8" }) + : source.toString("utf-8"); const doc = new DOMParser().parseFromString(xmlString, "text/xml"); let result; diff --git a/src/converters/youtube.ts b/src/converters/youtube.ts index e9d5fbd..465dda7 100644 --- a/src/converters/youtube.ts +++ b/src/converters/youtube.ts @@ -5,7 +5,7 @@ import { ConverterOptions, ConverterResult, DocumentConverter } from "../types"; export class YouTubeConverter implements DocumentConverter { async convert( - localPath: string, + source: string | Buffer, options: ConverterOptions = {} ): Promise { const fileExtension = options.file_extension || ""; @@ -17,7 +17,10 @@ export class YouTubeConverter implements DocumentConverter { return null; } try { - const htmlContent = fs.readFileSync(localPath, { encoding: "utf-8" }); + const htmlContent = + typeof source === "string" + ? 
fs.readFileSync(source, { encoding: "utf-8" }) + : source.toString("utf-8"); return this._convert(htmlContent, url, options); } catch (error) { console.error("YouTube Parsing Error:", error); diff --git a/src/converters/zip.ts b/src/converters/zip.ts index a9c370d..dd79576 100644 --- a/src/converters/zip.ts +++ b/src/converters/zip.ts @@ -1,10 +1,12 @@ import { ConverterOptions, ConverterResult, DocumentConverter } from "../types"; -import * as fs from "fs/promises"; +import * as fs from "fs"; import * as path from "path"; +import { PassThrough } from "stream"; +import unzipper from "unzipper"; export class ZipConverter implements DocumentConverter { async convert( - localPath: string, + source: string | Buffer, options: ConverterOptions = {} ): Promise { const fileExtension = options.file_extension || ""; @@ -15,19 +17,10 @@ export class ZipConverter implements DocumentConverter { if (!parentConverters) { return { title: null, - text_content: `[ERROR] No converters available to process zip contents from: ${localPath}` + text_content: `[ERROR] No converters available to process zip contents from: ${source}` }; } - const extractedZipFolderName = `extracted_${path.basename(localPath).replace(".zip", "_zip")}`; - const newFolder = path.normalize(path.join(path.dirname(localPath), extractedZipFolderName)); - let mdContent = `Content from the zip file \`${path.basename(localPath)}\`:\n\n`; - if (!newFolder.startsWith(path.dirname(localPath))) { - return { - title: null, - text_content: `[ERROR] Invalid zip file path: ${localPath}` - }; - } let unzipper; try { unzipper = await import("unzipper").then((mod) => mod.default); @@ -37,38 +30,58 @@ export class ZipConverter implements DocumentConverter { ); return null; } + try { - await fs.mkdir(newFolder, { recursive: true }); - const zip = await unzipper.Open.file(localPath); - await zip.extract({ path: newFolder }); + const zipFileName = typeof source === "string" ? 
path.basename(source) : "archive.zip"; + let mdContent = `Content from the zip file \`${zipFileName}\`:\n\n`; + const mdResults: string[] = []; - const files = await this._walk(newFolder); - for (const { root, name } of files) { - const filePath = path.join(root, name); - const relativePath = path.relative(newFolder, filePath); - const fileExtension = path.extname(name); + const processEntry = async (entry: unzipper.Entry) => { + const relativePath = entry.path; + if (entry.type === "File") { + const entryExtension = path.extname(relativePath); + const entryBuffer = await entry.buffer(); - const fileOptions = { - ...options, - file_extension: fileExtension, - _parent_converters: parentConverters - }; + const fileOptions = { + ...options, + file_extension: entryExtension, + _parent_converters: parentConverters + }; - for (const converter of parentConverters) { - if (converter instanceof ZipConverter) { - continue; - } - const result = await converter.convert(filePath, fileOptions); - if (result) { - mdContent += `\n## File: ${relativePath}\n\n`; - mdContent += result.text_content + "\n\n"; - break; + for (const converter of parentConverters) { + if (converter instanceof ZipConverter) { + continue; + } + const result = await converter.convert(entryBuffer, fileOptions); + if (result) { + mdResults.push(`\n## File: ${relativePath}\n\n${result.text_content}\n\n`); + break; + } } + } else { + entry.autodrain(); } - } - if (options.cleanupExtracted !== false) { - await fs.rm(newFolder, { recursive: true, force: true }); - } + }; + + const inputStream = + typeof source === "string" ? 
fs.createReadStream(source) : new PassThrough().end(source); + + await new Promise((res, rej) => { + const parser = unzipper.Parse(); + + parser.on("entry", (entry: unzipper.Entry) => { + processEntry(entry).catch((err) => { + parser.destroy(err); + rej(err); + }); + }); + parser.on("finish", res); + parser.on("error", rej); + + inputStream.pipe(parser); + }); + + mdContent += mdResults.join(""); return { title: null, @@ -78,25 +91,13 @@ export class ZipConverter implements DocumentConverter { if (error.message.includes("invalid signature")) { return { title: null, - text_content: `[ERROR] Invalid or corrupted zip file: ${localPath}` + text_content: `[ERROR] Invalid or corrupted zip file: ${source}` }; } return { title: null, - text_content: `[ERROR] Failed to process zip file ${localPath}: ${String(error)}` + text_content: `[ERROR] Failed to process zip file ${source}: ${String(error)}` }; } } - private async _walk(dir: string): Promise<{ root: string; name: string }[]> { - let results: { root: string; name: string }[] = []; - const files = await fs.readdir(dir, { withFileTypes: true }); - for (const file of files) { - if (file.isDirectory()) { - results = results.concat(await this._walk(path.join(dir, file.name))); - } else { - results.push({ root: dir, name: file.name }); - } - } - return results; - } } diff --git a/src/markitdown.ts b/src/markitdown.ts index 08e1a12..7e2eb37 100644 --- a/src/markitdown.ts +++ b/src/markitdown.ts @@ -37,6 +37,9 @@ export class MarkItDown { this.register_converter(new ZipConverter()); } + /** + * Converts a source from a file path, URL, or Response object. + */ async convert( source: string | Response, options: ConverterOptions = {} @@ -55,6 +58,18 @@ export class MarkItDown { } } } + + /** + * Converts a source from an in-memory Buffer. 
+ */ + async convertBuffer( + source: Buffer, + options: ConverterOptions & { file_extension: string } + ): Promise { + const extensions = new Set([options.file_extension]); + return this._convert(source, extensions, options); + } + private async convert_url( source: string, { fetch = globalThis.fetch, ...options }: ConverterOptions @@ -91,38 +106,26 @@ export class MarkItDown { extensions.add(path.extname(fname[1])); } - const url_ext = path.extname(new URL(response.url).pathname); - extensions.add(url_ext); - - const file = fname ? `/tmp/${fname?.[1]}` : "/tmp/temp"; - const temp_writeable = fs.createWriteStream(file); - - try { - if (response.body == null) { - throw new Error("Response body is empty"); - } + if (response.url) { + const url_ext = path.extname(new URL(response.url).pathname); + extensions.add(url_ext); + } - const reader = response.body.getReader(); - while (true) { - const { done, value } = await reader.read(); - if (done) break; - temp_writeable.write(value); - } + if (extensions.size === 0) { + throw new Error( + "Could not determine file type. Please provide a `file_extension` in the options." 
+ ); + } - temp_writeable.end(); - return await this._convert(file, extensions, { - ...options, - url: response.url - }); - } catch (e) { - throw new Error(`Could not write to file: ${e}`); - } finally { - try { - temp_writeable.close(); - } catch (e) { - throw new Error(`Could not close file: ${e}`); - } + if (response.body == null) { + throw new Error("Response body is empty"); } + + const buffer = Buffer.from(await response.arrayBuffer()); + return await this._convert(buffer, extensions, { + ...options, + url: response.url + }); } private async convert_local(source: string, options: ConverterOptions): Promise { @@ -145,7 +148,7 @@ export class MarkItDown { } private async _convert( - source: string, + source: string | Buffer, extensions: Set, options: any = {} ): Promise { diff --git a/src/types.ts b/src/types.ts index 1566912..ee3e7cd 100644 --- a/src/types.ts +++ b/src/types.ts @@ -25,5 +25,5 @@ export type ConverterOptions = { type MammothOptions = Parameters[1]; export interface DocumentConverter { - convert(local_path: string, options: ConverterOptions): Promise; + convert(source: string | Buffer, options: ConverterOptions): Promise; } From a0a10494f7d91532d93091b3c15c58eb3999201b Mon Sep 17 00:00:00 2001 From: Vaibhav Raj Date: Tue, 9 Sep 2025 02:13:44 +0530 Subject: [PATCH 2/4] test: add tests for buffer and blob conversions --- test/index.test.ts | 46 ++++++++++++++++++++++++++++++++++++++++++++++ test/test.data.ts | 1 - 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/test/index.test.ts b/test/index.test.ts index aed14b3..0dfd64f 100644 --- a/test/index.test.ts +++ b/test/index.test.ts @@ -1,6 +1,7 @@ import { describe, it, expect } from "vitest"; import { MarkItDown } from "../src/markitdown"; import * as path from "path"; +import * as fs from "fs"; import isCi from "is-ci"; import { openai } from "@ai-sdk/openai"; import { @@ -257,4 +258,49 @@ describe("MarkItDown Tests", () => { } }); }); + + describe("Buffer Conversion", () => { + 
it("should correctly convert a .zip file passed as a buffer", async () => { + const zipFilePath = path.join(__dirname, "__files/test_files.zip"); + const buffer = fs.readFileSync(zipFilePath); + const markitdown = new MarkItDown(); + const result = await markitdown.convertBuffer(buffer, { + file_extension: ".zip" // NOTE: this is required for buffer conversions + }); + + expect(result).not.toBeNull(); + expect(result).not.toBeUndefined(); + const textContent = result?.text_content.replace("\\", ""); + + expect(textContent).toContain("File: test.docx"); + for (const testString of DOCX_TEST_STRINGS) { + expect(textContent).toContain(testString); + } + }); + }); + + describe("Blob Conversion", () => { + it("should correctly convert a file passed as a Blob via a Response", async () => { + const zipFilePath = path.join(__dirname, "__files/test_files.zip"); + const buffer = fs.readFileSync(zipFilePath); + const blob = new Blob([buffer]); + + const response = new Response(blob, { + headers: { + "Content-Type": "application/zip" + } + }); + + const markitdown = new MarkItDown(); + const result = await markitdown.convert(response); + + expect(result).not.toBeNull(); + expect(result).not.toBeUndefined(); + const textContent = result?.text_content.replace("\\", ""); + + for (const testString of DOCX_TEST_STRINGS) { + expect(textContent).toContain(testString); + } + }); + }); }); diff --git a/test/test.data.ts b/test/test.data.ts index 244b5d4..05c2145 100644 --- a/test/test.data.ts +++ b/test/test.data.ts @@ -27,7 +27,6 @@ export const YOUTUBE_TEST_STRINGS = [ "## AutoGen FULL Tutorial with Python (Step-By-Step)", "This is an intermediate tutorial for installing and using AutoGen locally", "PT15M4S", - "the model we&#39;re going to be using today is GPT 3.5 turbo" ]; export const IPYNB_TEST_STRINGS = [ From 3670ceebda3082457b771cef4ec40264c8f96cb8 Mon Sep 17 00:00:00 2001 From: Vaibhav Raj Date: Tue, 9 Sep 2025 02:13:55 +0530 Subject: [PATCH 3/4] chore: update readme --- 
README.md | 104 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 94 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a4c8583..f1fc841 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,9 @@ [![CI](https://github.com/dead8309/markitdown-ts/actions/workflows/ci.yml/badge.svg)](https://github.com/dead8309/markitdown/actions/workflows/ci.yml) -`markitdown-ts` is a TypeScript library designed for converting various file formats to Markdown. This makes it suitable for indexing, text analysis, and other applications that benefit from structured text. It is a TypeScript implementation of the original `markitdown` [Python library.](https://github.com/microsoft/markitdown) +`markitdown-ts` is a TypeScript library designed for converting various file formats to Markdown. It can process files from local paths, URLs, or directly from in-memory buffers, making it ideal for serverless and edge environments like Supabase Functions or Cloudflare Workers. + +It is a TypeScript implementation of the original `markitdown` [Python library](https://github.com/microsoft/markitdown) and is suitable for indexing, text analysis, and other applications that benefit from structured text. It supports: @@ -32,12 +34,21 @@ pnpm add markitdown-ts ## Usage +### Basic Usage (from a File Path) + +The simplest way to use the library is by providing a local file path or a URL. + ```typescript import { MarkItDown } from "markitdown-ts"; const markitdown = new MarkItDown(); try { + // Convert a local file const result = await markitdown.convert("path/to/your/file.pdf"); + + // Or convert from a URL + const result = await markitdown.convert("https://arxiv.org/pdf/2308.08155v2.pdf"); + if (result) { console.log(result.text_content); } @@ -46,7 +57,57 @@ try { } ``` -Pass additional options as needed for specific functionality. 
+### Advanced Usage (from Buffers, Blobs, or Responses) + +For use in serverless environments where you can't rely on a persistent filesystem, you can convert data directly from memory. + +> [!IMPORTANT] +> +> This is the recommended approach for environments like **Supabase Edge Functions**, **Cloudflare Workers**, or **AWS Lambda**. + +#### From a Buffer + +If you have your file content in a `Buffer`, use the `convertBuffer` method. You **must** provide the `file_extension` in the options so the library knows which converter to use. + +```typescript +import { MarkItDown } from "markitdown-ts"; +import * as fs from "fs"; + +const markitdown = new MarkItDown(); +try { + const buffer = fs.readFileSync("path/to/your/file.docx"); + const result = await markitdown.convertBuffer(buffer, { + file_extension: ".docx" + }); + console.log(result?.text_content); +} catch (error) { + console.error("Conversion failed:", error); +} +``` + +#### From a Response or Blob + +You can pass a standard `Response` object directly to the `convert` method. This is perfect for handling file uploads from a request body. + +```typescript +import { MarkItDown } from "markitdown-ts"; + +const markitdown = new MarkItDown(); + +// Example: Simulating a file upload by creating a Blob and a Response (also needs: import * as fs from "fs") +const buffer = fs.readFileSync("path/to/archive.zip"); +const blob = new Blob([buffer]); +const response = new Response(blob, { + headers: { "Content-Type": "application/zip" } +}); + +try { + const result = await markitdown.convert(response); + console.log(result?.text_content); +} catch (error) { + console.error("Conversion failed:", error); +} +``` ## YouTube Transcript Support @@ -76,11 +137,22 @@ const result = await markitdown.convert("test.jpg", { ## API -The library uses a single function `convert` for all conversions, with the options and the response type defined as such: +The library exposes a `MarkItDown` class with two primary conversion methods. 
```typescript -export interface DocumentConverter { - convert(local_path: string, options: ConverterOptions): Promise<ConverterResult>; +class MarkItDown { + /** + * Converts a source from a file path, URL, or Response object. + */ + async convert(source: string | Response, options?: ConverterOptions): Promise<ConverterResult>; + + /** + * Converts a source from an in-memory Buffer. + */ + async convertBuffer( + source: Buffer, + options: ConverterOptions & { file_extension: string } + ): Promise<ConverterResult>; } export type ConverterResult = @@ -92,16 +164,28 @@ export type ConverterResult = | undefined; export type ConverterOption = { + // Required when using convertBuffer file_extension?: string; + + // For URL-based converters (e.g., Wikipedia, Bing SERP) url?: string; + + // Provide a custom fetch implementation fetch?: typeof fetch; - enableYoutubeTranscript?: boolean; // false by default - youtubeTranscriptLanguage?: string; // "en" by default - llmModel: string; + + // YouTube-specific options + enableYoutubeTranscript?: boolean; // Default: false + youtubeTranscriptLanguage?: string; // Default: "en" + + // Image-specific LLM options + llmModel?: LanguageModel; llmPrompt?: string; + + // Options for .docx conversion (passed to mammoth.js) styleMap?: string | Array<string>; - _parent_converters?: DocumentConverter[]; - cleanup_extracted?: boolean; + + // Options for .zip conversion + cleanupExtracted?: boolean; // Default: true }; ``` From 9628c006b19c58710e4af9645703c2512354e520 Mon Sep 17 00:00:00 2001 From: Vaibhav Raj Date: Tue, 9 Sep 2025 02:14:34 +0530 Subject: [PATCH 4/4] chore: format --- src/converters/media.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/converters/media.ts b/src/converters/media.ts index 57ff01c..bcb2ec5 100644 --- a/src/converters/media.ts +++ b/src/converters/media.ts @@ -5,7 +5,10 @@ import * as util from "util"; const exec = util.promisify(childProcess.exec); export abstract class MediaConverter implements DocumentConverter { - abstract convert(source: 
string | Buffer, options: ConverterOptions): Promise; + abstract convert( + source: string | Buffer, + options: ConverterOptions + ): Promise; async _getMetadata(local_path: string): Promise<{ [key: string]: string } | null> { const exiftool = await this._which("exiftool");