diff --git a/src/appmixer/utils/converters/PDF2Text/PDF2Text.js b/src/appmixer/utils/converters/PDF2Text/PDF2Text.js new file mode 100644 index 000000000..2df60303b --- /dev/null +++ b/src/appmixer/utils/converters/PDF2Text/PDF2Text.js @@ -0,0 +1,21 @@ +'use strict'; +const converters = require('../converters'); +const path = require('path'); +const { Readable } = require('stream'); + +module.exports = { + + async receive(context) { + + const { fileId, outputTextContent } = context.messages.in.content; + const fileInfo = await context.getFileInfo(fileId); + const newFileName = path.parse(fileInfo.filename).name + '.txt'; + const textContent = await converters.pdfToText(context, fileId); + const savedFile = await context.saveFileStream(newFileName, Readable.from(textContent)); + const out = { ...savedFile }; + if (outputTextContent) { + out.textContent = (await context.loadFile(savedFile.fileId)).toString('utf8'); + } + return context.sendJson(out, 'out'); + } +}; diff --git a/src/appmixer/utils/converters/PDF2Text/component.json b/src/appmixer/utils/converters/PDF2Text/component.json new file mode 100644 index 000000000..3d98fbe3d --- /dev/null +++ b/src/appmixer/utils/converters/PDF2Text/component.json @@ -0,0 +1,49 @@ +{ + "name": "appmixer.utils.converters.PDF2Text", + "author": "Appmixer ", + "description": "Extract text from a PDF file.", + "private": false, + "inPorts": [ + { + "name": "in", + "schema": { + "properties": { + "fileId": { "type": "string" }, + "outputTextContent": { "type": "boolean" } + }, + "required": [ "fileId" ] + }, + "inspector": { + "inputs": { + "fileId": { + "type": "filepicker", + "label": "File ID", + "index": 1, + "tooltip": "The ID of the PDF file to convert." + }, + "outputTextContent": { + "type": "toggle", + "defaultValue": false, + "label": "Output Text Content", + "index": 2, + "tooltip": "By default, the actual PDF text content is not returned, allowing very large files to be processed without working with the entire file data directly. For convenience, if your PDF files are smaller than 10 MB and you want to work with the pdf text directly, you can set this option. (Note that you can use the 'Files.Load File' utility to read any file data when needed)." + } + } + } + } + ], + "outPorts": [ + { + "name": "out", + "options": [ + { "label": "File ID", "value": "fileId" }, + { "label": "File Name", "value": "fileName" }, + { "label": "File Size (Bytes)", "value": "length" }, + { "label": "File Upload Date", "value": "uploadDate" }, + { "label": "File MD5", "value": "md5" }, + { "label": "Text Content", "value": "textContent" } + ] + } + ], + "icon": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAQAAAAEACAQAAAD2e2DtAAAHyklEQVR42u3dX2hWdRzH8c+GCDq1YVPbRRFqcyiFlnqVbDlZkloMmrWyqGtJGDHyQhdRwZZU5DRcdZHKpJQCbyzTsAJJU0fqLsz/0Y1TIZ2VTN0Ww4yp27Pz73me8/v+3p/vnTd7zvm+POc83/M7z5EIIYQQQgghhBBCCCEkmYxRuSpUa6yWaaXWaI1Watl//1KpUpo9MA9ouXbpivo8qg41qJjWF+oVHfKq8QOrU3V+t/8pHfG2+beq2dfml2in982/We/62P6HdZrW/1/1vrX/Gc8u+IarHj3nU/vnqZum31HXVO1L+x/UeRo+SHXpUR/aP1ZHafYQdV4P2QewgUZnqOOaYLv903SdNmesX1RkGcDXtHjY2q2RVts/V700OEBtVIFNAC00N2A12QRwhtb6PBucRVtDVK9esgZgNW0NVd2abwtAG00NWZc10xKAH2ip37PB32hohDqpSVYAdNHOSLXfymww+CYvdXxLtzIbBECy1WZhNggAz2eDAPB8NggAz2eDAPB83SAAPF83CADPZ4MA8Hw2CICk6oDGAMAGgCs+zQYBcHft0Dl/ZoMAuLu2ak7ko0ATACwAkKoiPylZDwALAKQXIy6Xd2w2CIChAEgrfZgNAmBoANKH9meDAMgEoECbrM8GAZAJgDRS39meDQIgMwBpnNotzwYBMBwAaaKO250NAmB4ANIUu7NBAAQBILuzQQAEA2B2NgiAoACMzgYBEByAydkgAMIAiDMbnAUACwDMzQYBEA6AudkgAMICMDYbBEB4AKZmgwCIAsDQbBAAnv/eIADyUY0A8BtAn94BgN8A+rQCAH4D6NEjAPAZQJ9OA8BvAH2qAoDfAPYAwG8AfwLAbwA9AMhdvkzlT84CIGf5FAB+A1gFAL8BVAHAbwAjUvl2ZAB4fhIAQA5TpBMA8BmANFt/AcBnAP2XgpcA4DMAqVx7AeAzgP6HO2q0M/KjngAwkiLNVo1qs1AfAMDvLAUAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAqUxt4H3aCwCLeZJXxvidaYH36WEAWEyhLgbcpxsAYDOtAffpAgBYPQlcD7BHj6gQAFazNsAr4yrS8EEBkJ2M0oFh9mdjOj4oALKVkowEmtPyMQGQzaNAi24Msic7VZeeDwmA7GaGWm97W1mHGlScpg8IgFzMBcq1SLWqVGn6PhwAPA8AAAAAAAAAAAAImZsvnarRbBUBwK/c+dq5bu1UjQoA4EeGevHkXpUDwH4yvXr2kqoBYDvDvXz6qhYCwPJl3/Cvn3eOAACCZ1Wg/dStJQCwmBG33dQxcxQAQPDLv+D7yiECAEj2BOAcAQAEzWehADhzLQCAoGkLCcCRowAAgmZ9aABOEABA0CyPAMCBEwEAgmZ6JACpPwoAIHgOWiQAgOB5OiKAVBMAQJh8EZlAaq8FABAm96jd2lEAAOEyUR22CAAgbCboiKUTAQA8JwAAzwkAwHMCAPCcAAA8JwAAzwlYBVCiF9SsLdqa5dqe4TkBJwhYBDBVbboWuS25rBt6FgBJZ4WuOtH8Wz8XXQ+AJPO+Q82/VfUASCqvOdj+/qNANQCSSJlTB/+BdSV/PxprCcAWR9vfX00AiJvxA36ywb26AIC4qXO4/f01HgDx8p7jAF4FQLxschzA2wCIl42OA3gLAJwCAMBFIAD4GggABkEAYBQMAG4GASB8uB3sOQDXFoRc17L87i6WhOWvevS9SvK9s1gUmsv6Vu06oT90Urv1usakYUexLNzzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAE4DKNM67dNZ/a5j2qftKRsJf65Vmq+RAMhOqnTGiTuB57RaowGQdL5yai3AaT0GgCRz0LnlIP/oCQAklW1OLgnrUhkAkki1s2uCf87fy+ctATjr8LLwGgDEzRSnnwv6BgBx0+I0gGsaBYB42e/4s4FzARAvpxwHsAQA8XLScQCLAcApAABcBAKAr4EAYBAEAEbBAOBmEADC5xC3g/0GUKAfWRDiM4B+Aq78ZCRLwrJGYG2MtpxiUaj7q4LjEfgkf9fkAIAAACAAAAgAAAK5SZdD35k79ZMaNRkCSeaYc5OzHm3WfRBIKnucnJ53al4oAutj/K11tgm0OXr/7G89njMCTZYBrHb2FmqnJuXsRPC8XQAzHV5EsTnk5WD0o8AFjbNL4IyzAHo0NWcE3rAL4COHjwFvhp4LRCXQYRfAHPU6C2BPDkdDk+wS2ObwhWCU6WC0o0ClXQBlTrxeYfDV9MoZgRoZzseOAriUw3sE1ZYBjNVRJwEcjnGbKOxRYJrtkfBkXXAQwIZYdwrDELiYvzf75SqVDl4JLIh5szg4gVZ5kMVO3RzuU3vs2zRBrwVuaLq8yAyHnrLvUUUCWxzsKLBW3uRe7XAEQGNCWzw8gQP5enQ7X1moX1Pf/uYEt7dAazK2v0TepVAv62BqR8Sdqkt8i+vUOei5v8W3//0Dc7+Wa5cup6r5HWpQcVa2tlgN6hjwl86rVTNEJI3WNFWoNu9VqdKsb2upKlWrRSq3/72fEEIIIYQQQgghhBDiSv4FeV1otCGtp+0AAAAASUVORK5CYII=" +} diff --git a/src/appmixer/utils/converters/bundle.json b/src/appmixer/utils/converters/bundle.json index 6801250c4..d08afa082 100644 --- a/src/appmixer/utils/converters/bundle.json +++ b/src/appmixer/utils/converters/bundle.json @@ -1,6 +1,6 @@ { "name": "appmixer.utils.converters", - "version": "1.3.2", + "version": "1.4.0", "changelog": { "1.1.0": [ "FileId inputs changed from 'text' to 'filepicker'. Requires Appmixer 4.4.4 or higher." @@ -17,6 +17,9 @@ "1.3.2": [ "improve CSV2HTML, CSV2JSON and JSON2CSV, JSON2HTML converters to use streams.", "JSON2HTML, CSV2HTML: added support for custom HTML renderers." + ], + "1.4.0": [ + "PDF2Text: new component to extract text from a PDF file." ] } } diff --git a/src/appmixer/utils/converters/converters.js b/src/appmixer/utils/converters/converters.js index 8c82cdcb4..1c2c718c1 100644 --- a/src/appmixer/utils/converters/converters.js +++ b/src/appmixer/utils/converters/converters.js @@ -1,6 +1,8 @@ const { pipeline } = require('stream'); const customHtmlTransforms = require('./htmlTransforms'); const JSONStream = require('JSONStream'); +const { arrayBuffer } = require('node:stream/consumers'); +let pdfjslib; const { getCSVReadStream, csvToJsonTransform, @@ -95,5 +97,26 @@ module.exports = { .then(resolve) .catch(reject); }); + }, + + pdfToText: async function(context, sourceFileId) { + + if (!pdfjslib) { + pdfjslib = await import('pdfjs-dist/legacy/build/pdf.mjs'); + } + const readStream = await context.getFileReadStream(sourceFileId); + let text = ''; + // Unfortunately, `data` can only be a string, buffer or TypedArray. + // getDocument() does not work over streams. + const loadingTask = pdfjslib.getDocument({ data: await arrayBuffer(readStream) }); + const pdfDoc = await loadingTask.promise; + for (let i = 0; i < pdfDoc.numPages; i++) { + const page = await pdfDoc.getPage(i + 1); + const textContent = await page.getTextContent(); + if (textContent.items.length) { + text += textContent.items.map(item => item.str || '').join(' '); + } + } + return text; } }; diff --git a/src/appmixer/utils/converters/package-lock.json b/src/appmixer/utils/converters/package-lock.json index 646c56ce5..e5b243b00 100644 --- a/src/appmixer/utils/converters/package-lock.json +++ b/src/appmixer/utils/converters/package-lock.json @@ -10,11 +10,185 @@ "dependencies": { "archiver": "5.3.1", "bluebird": "3.7.2", + "csv-parse": "5.5.6", + "JSONStream": "1.3.5", + "pdfjs-dist": "4.10.38", "tar": "6.1.12", "unzipper": "0.10.11", "xlsx": "0.18.5" } }, + "node_modules/@napi-rs/canvas": { + "version": "0.1.65", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.65.tgz", + "integrity": "sha512-YcFhXQcp+b2d38zFOJNbpyPHnIL7KAEkhJQ+UeeKI5IpE9B8Cpf/M6RiHPQXSsSqnYbrfFylnW49dyh2oeSblQ==", + "optional": true, + "engines": { + "node": ">= 10" + }, + "optionalDependencies": { + "@napi-rs/canvas-android-arm64": "0.1.65", + "@napi-rs/canvas-darwin-arm64": "0.1.65", + "@napi-rs/canvas-darwin-x64": "0.1.65", + "@napi-rs/canvas-linux-arm-gnueabihf": "0.1.65", + "@napi-rs/canvas-linux-arm64-gnu": "0.1.65", + "@napi-rs/canvas-linux-arm64-musl": "0.1.65", + "@napi-rs/canvas-linux-riscv64-gnu": "0.1.65", + "@napi-rs/canvas-linux-x64-gnu": "0.1.65", + "@napi-rs/canvas-linux-x64-musl": "0.1.65", + "@napi-rs/canvas-win32-x64-msvc": "0.1.65" + } + }, + "node_modules/@napi-rs/canvas-android-arm64": { + "version": "0.1.65", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.65.tgz", + "integrity": "sha512-ZYwqFYEKcT5Zr8lbiaJNJj/poLaeK2TncolY914r+gD2TJNeP7ZqvE7A2SX/1C9MB4E3DQEwm3YhL3WEf0x3MQ==", + "cpu": [ + "arm64" + ], + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-darwin-arm64": { + "version": "0.1.65", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.65.tgz", + "integrity": "sha512-Pg1pfiJEyDIsX+V0QaJPRWvXbw5zmWAk3bivFCvt/5pwZb37/sT6E/RqPHT9NnqpDyKW6SriwY9ypjljysUA1Q==", + "cpu": [ + "arm64" + ], + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-darwin-x64": { + "version": "0.1.65", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.65.tgz", + "integrity": "sha512-3Tr+/HjdJN7Z/VKIcsxV2DvDIibZCExgfYTgljCkUSFuoI7iNkOE6Dc1Q6j212EB9PeO8KmfrViBqHYT6IwWkA==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-linux-arm-gnueabihf": { + "version": "0.1.65", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.65.tgz", + "integrity": "sha512-3KP+dYObH7CVkZMZWwk1WX9jRjL+EKdQtD43H8MOI+illf+dwqLlecdQ4d9bQRIxELKJ8dyPWY4fOp/Ngufrdg==", + "cpu": [ + "arm" + ], + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-linux-arm64-gnu": { + "version": "0.1.65", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.65.tgz", + "integrity": "sha512-Ka3StKz7Dq7kjTF3nNJCq43UN/VlANS7qGE3dWkn1d+tQNsCRy/wRmyt1TUFzIjRqcTFMQNRbgYq84+53UBA0A==", + "cpu": [ + "arm64" + ], + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-linux-arm64-musl": { + "version": "0.1.65", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.65.tgz", + "integrity": "sha512-O4xMASm2JrmqYoiDyxVWi+z5C14H+oVEag2rZ5iIA67dhWqYZB+iO7wCFpBYRj31JPBR29FOsu6X9zL+DwBFdw==", + "cpu": [ + "arm64" + ], + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-linux-riscv64-gnu": { + "version": "0.1.65", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.65.tgz", + "integrity": "sha512-dblWDaA59ZU8bPbkfM+riSke7sFbNZ70LEevUdI5rgiFEUzYUQlU34gSBzemTACj5rCWt1BYeu0GfkLSjNMBSw==", + "cpu": [ + "riscv64" + ], + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-linux-x64-gnu": { + "version": "0.1.65", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.65.tgz", + "integrity": "sha512-wsp+atutw13OJXGU3DDkdngtBDoEg01IuK5xMe0L6VFPV8maGkh17CXze078OD5QJOc6kFyw3DDscMLOPF8+oA==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-linux-x64-musl": { + "version": "0.1.65", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.65.tgz", + "integrity": "sha512-odX+nN+IozWzhdj31INcHz3Iy9+EckNw+VqsZcaUxZOTu7/3FmktRNI6aC1qe5minZNv1m05YOS1FVf7fvmjlA==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + } + }, + "node_modules/@napi-rs/canvas-win32-x64-msvc": { + "version": "0.1.65", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.65.tgz", + "integrity": "sha512-RZQX3luWnlNWgdMnLMQ1hyfQraeAn9lnxWWVCHuUM4tAWEV8UDdeb7cMwmJW7eyt8kAosmjeHt3cylQMHOxGFg==", + "cpu": [ + "x64" + ], + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + } + }, "node_modules/adler-32": { "version": "1.3.1", "resolved": "https://registry.npmjs.org/adler-32/-/adler-32-1.3.1.tgz", @@ -293,6 +467,11 @@ "node": ">= 10" } }, + "node_modules/csv-parse": { + "version": "5.5.6", + "resolved": "https://registry.npmjs.org/csv-parse/-/csv-parse-5.5.6.tgz", + "integrity": "sha512-uNpm30m/AGSkLxxy7d9yRXpJQFrZzVWLFBkS+6ngPcZkw/5k3L/jjFuj7tVnEpRn+QgmiXr21nDlhCiUK4ij2A==" + }, "node_modules/duplexer2": { "version": "0.1.4", "resolved": "https://registry.npmjs.org/duplexer2/-/duplexer2-0.1.4.tgz", @@ -452,6 +631,29 @@ "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==" }, + "node_modules/jsonparse": { + "version": "1.3.1", + "resolved": "https://registry.npmjs.org/jsonparse/-/jsonparse-1.3.1.tgz", + "integrity": "sha512-POQXvpdL69+CluYsillJ7SUhKvytYjW9vG/GKpnf+xP8UWgYEM/RaMzHHofbALDiKbbP1W8UEYmgGl39WkPZsg==", + "engines": [ + "node >= 0.2.0" + ] + }, + "node_modules/JSONStream": { + "version": "1.3.5", + "resolved": "https://registry.npmjs.org/JSONStream/-/JSONStream-1.3.5.tgz", + "integrity": "sha512-E+iruNOY8VV9s4JEbe1aNEm6MiszPRr/UfcHMz0TQh1BXSxHK+ASV1R6W4HpjBhSeS+54PIsAMCBmwD06LLsqQ==", + "dependencies": { + "jsonparse": "^1.2.0", + "through": ">=2.2.7 <3" + }, + "bin": { + "JSONStream": "bin.js" + }, + "engines": { + "node": "*" + } + }, "node_modules/lazystream": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/lazystream/-/lazystream-1.0.1.tgz", @@ -597,6 +799,17 @@ "node": ">=0.10.0" } }, + "node_modules/pdfjs-dist": { + "version": "4.10.38", + "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-4.10.38.tgz", + "integrity": "sha512-/Y3fcFrXEAsMjJXeL9J8+ZG9U01LbuWaYypvDW2ycW1jL269L3js3DVBjDJ0Up9Np1uqDXsDrRihHANhZOlwdQ==", + "engines": { + "node": ">=20" + }, + "optionalDependencies": { + "@napi-rs/canvas": "^0.1.65" + } + }, "node_modules/process-nextick-args": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", @@ -727,6 +940,11 @@ "node": ">=6" } }, + "node_modules/through": { + "version": "2.3.8", + "resolved": "https://registry.npmjs.org/through/-/through-2.3.8.tgz", + "integrity": "sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==" + }, "node_modules/traverse": { "version": "0.3.9", "resolved": "https://registry.npmjs.org/traverse/-/traverse-0.3.9.tgz", diff --git a/src/appmixer/utils/converters/package.json b/src/appmixer/utils/converters/package.json index a8d4c8b5f..28fb8b95d 100644 --- a/src/appmixer/utils/converters/package.json +++ b/src/appmixer/utils/converters/package.json @@ -7,6 +7,7 @@ "bluebird": "3.7.2", "csv-parse": "5.5.6", "JSONStream": "1.3.5", + "pdfjs-dist": "4.10.38", "tar": "6.1.12", "unzipper": "0.10.11", "xlsx": "0.18.5"