diff --git a/api/package-lock.json b/api/package-lock.json index 9959943f..9d83deaa 100644 --- a/api/package-lock.json +++ b/api/package-lock.json @@ -27,8 +27,8 @@ "locutus": "^2", "morgan": "~1.10.1", "mysql2": "^3.16", - "n3": "^2", - "nanoid": "^5.1.6", + "n3": "^1", + "nanoid": "^3.3.11", "needle": "^3.1.0", "node-tesseract-ocr": "^2", "passport": "^0.7.0", @@ -654,17 +654,17 @@ } }, "node_modules/@es-joy/jsdoccomment": { - "version": "0.78.0", - "resolved": "https://registry.npmjs.org/@es-joy/jsdoccomment/-/jsdoccomment-0.78.0.tgz", - "integrity": "sha512-rQkU5u8hNAq2NVRzHnIUUvR6arbO0b6AOlvpTNS48CkiKSn/xtNfOzBK23JE4SiW89DgvU7GtxLVgV4Vn2HBAw==", + "version": "0.76.0", + "resolved": "https://registry.npmjs.org/@es-joy/jsdoccomment/-/jsdoccomment-0.76.0.tgz", + "integrity": "sha512-g+RihtzFgGTx2WYCuTHbdOXJeAlGnROws0TeALx9ow/ZmOROOZkVg5wp/B44n0WJgI4SQFP1eWM2iRPlU2Y14w==", "dev": true, "license": "MIT", "dependencies": { "@types/estree": "^1.0.8", - "@typescript-eslint/types": "^8.46.4", + "@typescript-eslint/types": "^8.46.0", "comment-parser": "1.4.1", "esquery": "^1.6.0", - "jsdoc-type-pratt-parser": "~7.0.0" + "jsdoc-type-pratt-parser": "~6.10.0" }, "engines": { "node": ">=20.11.0" @@ -3640,9 +3640,9 @@ "license": "MIT" }, "node_modules/baseline-browser-mapping": { - "version": "2.9.13", - "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.13.tgz", - "integrity": "sha512-WhtvB2NG2wjr04+h77sg3klAIwrgOqnjS49GGudnUPGFFgg7G17y7Qecqp+2Dr5kUDxNRBca0SK7cG8JwzkWDQ==", + "version": "2.9.11", + "resolved": "https://registry.npmjs.org/baseline-browser-mapping/-/baseline-browser-mapping-2.9.11.tgz", + "integrity": "sha512-Sg0xJUNDU1sJNGdfGWhVHX0kkZ+HWcvmVymJbj6NSgZZmW/8S9Y2HQ5euytnIgakgxN6papOAWiwDo1ctFDcoQ==", "dev": true, "license": "Apache-2.0", "bin": { @@ -3739,9 +3739,9 @@ } }, "node_modules/body-parser": { - "version": "2.2.2", - "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-2.2.2.tgz", - "integrity": "sha512-oP5VkATKlNwcgvxi0vM0p/D3n2C3EReYVX+DNYs5TjZFn/oQt2j+4sVJtSMr18pdRr8wjTcBl6LoV+FUwzPmNA==", + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/body-parser/-/body-parser-2.2.1.tgz", + "integrity": "sha512-nfDwkulwiZYQIGwxdy0RUmowMhKcFVcYXUU7m4QlKYim1rUtg83xm2yjZ40QjDuc291AJjjeSc9b++AWHSgSHw==", "license": "MIT", "dependencies": { "bytes": "^3.1.2", @@ -3750,7 +3750,7 @@ "http-errors": "^2.0.0", "iconv-lite": "^0.7.0", "on-finished": "^2.4.1", - "qs": "^6.14.1", + "qs": "^6.14.0", "raw-body": "^3.0.1", "type-is": "^2.0.1" }, @@ -4404,9 +4404,9 @@ } }, "node_modules/caniuse-lite": { - "version": "1.0.30001763", - "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001763.tgz", - "integrity": "sha512-mh/dGtq56uN98LlNX9qdbKnzINhX0QzhiWBFEkFfsFO4QyCvL8YegrJAazCwXIeqkIob8BlZPGM3xdnY+sgmvQ==", + "version": "1.0.30001762", + "resolved": "https://registry.npmjs.org/caniuse-lite/-/caniuse-lite-1.0.30001762.tgz", + "integrity": "sha512-PxZwGNvH7Ak8WX5iXzoK1KPZttBXNPuaOvI2ZYU7NrlM+d9Ov+TUvlLOBNGzVXAntMSMMlJPd+jY6ovrVjSmUw==", "dev": true, "funding": [ { @@ -5300,20 +5300,20 @@ } }, "node_modules/eslint-plugin-jsdoc": { - "version": "61.7.1", - "resolved": "https://registry.npmjs.org/eslint-plugin-jsdoc/-/eslint-plugin-jsdoc-61.7.1.tgz", - "integrity": "sha512-36DpldF95MlTX//n3/naULFVt8d1cV4jmSkx7ZKrE9ikkKHAgMLesuWp1SmwpVwAs5ndIM6abKd6PeOYZUgdWg==", + "version": "61.5.0", + "resolved": "https://registry.npmjs.org/eslint-plugin-jsdoc/-/eslint-plugin-jsdoc-61.5.0.tgz", + "integrity": "sha512-PR81eOGq4S7diVnV9xzFSBE4CDENRQGP0Lckkek8AdHtbj+6Bm0cItwlFnxsLFriJHspiE3mpu8U20eODyToIg==", "dev": true, "license": "BSD-3-Clause", "dependencies": { - "@es-joy/jsdoccomment": "~0.78.0", + "@es-joy/jsdoccomment": "~0.76.0", "@es-joy/resolve.exports": "1.2.0", "are-docs-informative": "^0.0.2", "comment-parser": "1.4.1", "debug": "^4.4.3", "escape-string-regexp": "^4.0.0", - "espree": "^11.0.0", - "esquery": "^1.7.0", + "espree": "^10.4.0", + "esquery": "^1.6.0", "html-entities": "^2.6.0", "object-deep-merge": "^2.0.0", "parse-imports-exports": "^0.2.4", @@ -5328,37 +5328,6 @@ "eslint": "^7.0.0 || ^8.0.0 || ^9.0.0" } }, - "node_modules/eslint-plugin-jsdoc/node_modules/eslint-visitor-keys": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/eslint-visitor-keys/-/eslint-visitor-keys-5.0.0.tgz", - "integrity": "sha512-A0XeIi7CXU7nPlfHS9loMYEKxUaONu/hTEzHTGba9Huu94Cq1hPivf+DE5erJozZOky0LfvXAyrV/tcswpLI0Q==", - "dev": true, - "license": "Apache-2.0", - "engines": { - "node": "^20.19.0 || ^22.13.0 || >=24" - }, - "funding": { - "url": "https://opencollective.com/eslint" - } - }, - "node_modules/eslint-plugin-jsdoc/node_modules/espree": { - "version": "11.0.0", - "resolved": "https://registry.npmjs.org/espree/-/espree-11.0.0.tgz", - "integrity": "sha512-+gMeWRrIh/NsG+3NaLeWHuyeyk70p2tbvZIWBYcqQ4/7Xvars6GYTZNhF1sIeLcc6Wb11He5ffz3hsHyXFrw5A==", - "dev": true, - "license": "BSD-2-Clause", - "dependencies": { - "acorn": "^8.15.0", - "acorn-jsx": "^5.3.2", - "eslint-visitor-keys": "^5.0.0" - }, - "engines": { - "node": "^20.19.0 || ^22.13.0 || >=24" - }, - "funding": { - "url": "https://opencollective.com/eslint" - } - }, "node_modules/eslint-plugin-prettier": { "version": "5.5.4", "resolved": "https://registry.npmjs.org/eslint-plugin-prettier/-/eslint-plugin-prettier-5.5.4.tgz", @@ -6720,9 +6689,9 @@ } }, "node_modules/ioredis": { - "version": "5.9.1", - "resolved": "https://registry.npmjs.org/ioredis/-/ioredis-5.9.1.tgz", - "integrity": "sha512-BXNqFQ66oOsR82g9ajFFsR8ZKrjVvYCLyeML9IvSMAsP56XH2VXBdZjmI11p65nXXJxTEt1hie3J2QeFJVgrtQ==", + "version": "5.9.0", + "resolved": "https://registry.npmjs.org/ioredis/-/ioredis-5.9.0.tgz", + "integrity": "sha512-T3VieIilNumOJCXI9SDgo4NnF6sZkd6XcmPi6qWtw4xqbt8nNz/ZVNiIH1L9puMTSHZh1mUWA4xKa2nWPF4NwQ==", "license": "MIT", "dependencies": { "@ioredis/commands": "1.5.0", @@ -7707,9 +7676,9 @@ } }, "node_modules/jsdoc-type-pratt-parser": { - "version": "7.0.0", - "resolved": "https://registry.npmjs.org/jsdoc-type-pratt-parser/-/jsdoc-type-pratt-parser-7.0.0.tgz", - "integrity": "sha512-c7YbokssPOSHmqTbSAmTtnVgAVa/7lumWNYqomgd5KOMyPrRve2anx6lonfOsXEQacqF9FKVUj7bLg4vRSvdYA==", + "version": "6.10.0", + "resolved": "https://registry.npmjs.org/jsdoc-type-pratt-parser/-/jsdoc-type-pratt-parser-6.10.0.tgz", + "integrity": "sha512-+LexoTRyYui5iOhJGn13N9ZazL23nAHGkXsa1p/C8yeq79WRfLBag6ZZ0FQG2aRoc9yfo59JT9EYCQonOkHKkQ==", "dev": true, "license": "MIT", "engines": { @@ -8620,9 +8589,9 @@ } }, "node_modules/n3": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/n3/-/n3-2.0.0.tgz", - "integrity": "sha512-U16wgzeUvnP92QSw9q3CWpYcIMB7egm1krI79+a89HoIFf/5Qwvh1RGAV51+W9Dlwr9+1hb60JZlfPfUyzdHtw==", + "version": "1.26.0", + "resolved": "https://registry.npmjs.org/n3/-/n3-1.26.0.tgz", + "integrity": "sha512-SQknS0ua90rN+3RHuk8BeIqeYyqIH/+ecViZxX08jR4j6MugqWRjtONl3uANG/crWXnOM2WIqBJtjIhVYFha+w==", "license": "MIT", "dependencies": { "buffer": "^6.0.3", @@ -8645,9 +8614,9 @@ } }, "node_modules/nanoid": { - "version": "5.1.6", - "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-5.1.6.tgz", - "integrity": "sha512-c7+7RQ+dMB5dPwwCp4ee1/iV/q2P6aK1mTZcfr1BTuVlyW9hJYiMPybJCcnBlQtuSmTIWNeazm/zqNoZSSElBg==", + "version": "3.3.11", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", + "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", "funding": [ { "type": "github", @@ -8656,10 +8625,10 @@ ], "license": "MIT", "bin": { - "nanoid": "bin/nanoid.js" + "nanoid": "bin/nanoid.cjs" }, "engines": { - "node": "^18 || >=20" + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" } }, "node_modules/napi-build-utils": { diff --git a/api/package.json b/api/package.json index d90a7a0e..0b264de7 100644 --- a/api/package.json +++ b/api/package.json @@ -39,8 +39,8 @@ "locutus": "^2", "morgan": "~1.10.1", "mysql2": "^3.16", - "n3": "^2", - "nanoid": "^5.1.6", + "n3": "^1", + "nanoid": "^3.3.11", "needle": "^3.1.0", "node-tesseract-ocr": "^2", "passport": "^0.7.0", diff --git a/api/src/jobs/Metadata.ts b/api/src/jobs/Metadata.ts index 9f44b2aa..f55beb4e 100644 --- a/api/src/jobs/Metadata.ts +++ b/api/src/jobs/Metadata.ts @@ -1,6 +1,5 @@ import { Job as QueueJob } from "bullmq"; import fs = require("fs"); -import tmp = require("tmp"); import Config from "../models/Config"; import { FedoraObject } from "../models/FedoraObject"; import FedoraObjectFactory from "../services/FedoraObjectFactory"; @@ -22,16 +21,18 @@ class MetadataProcessor { } async addMasterMetadataDatastream(): Promise { + console.log(`Adding master metadata datastream to ${this.pid}`); const fedoraObject: FedoraObject = FedoraObject.build(this.pid, null, this.config); - const dataStream: Buffer = await fedoraObject.getDatastreamAsBuffer("MASTER"); - const contentFile = tmp.fileSync(); - fs.writeFileSync(contentFile.name, dataStream); - await fedoraObject.addMasterMetadataDatastream(contentFile.name); - fs.truncateSync(contentFile.name, 0); - fs.rmSync(contentFile.name); + console.log("FedoraObject.build: Done"); + // Stream the MASTER datastream directly to a temporary file to avoid + // buffering very large files into memory, then run FITS on that file. + const contentPath = await fedoraObject.getDatastreamToTempFile("MASTER"); + await fedoraObject.addMasterMetadataDatastream(contentPath); + fs.truncateSync(contentPath, 0); + fs.rmSync(contentPath); // FITS XML will have been generated in /tmp as a side-effect; clean it up: - fs.truncateSync(contentFile.name + ".fits.xml", 0); - fs.rmSync(contentFile.name + ".fits.xml"); + fs.truncateSync(contentPath + ".fits.xml", 0); + fs.rmSync(contentPath + ".fits.xml"); } async run(): Promise { diff --git a/api/src/models/FedoraObject.ts b/api/src/models/FedoraObject.ts old mode 100644 new mode 100755 index 684729b0..d65a81f0 --- a/api/src/models/FedoraObject.ts +++ b/api/src/models/FedoraObject.ts @@ -5,6 +5,7 @@ import Config from "./Config"; import { DatastreamParameters, Fedora } from "../services/Fedora"; import FedoraDataCollector from "../services/FedoraDataCollector"; import { execSync } from "child_process"; +import crypto = require("crypto"); import { Agent } from "../services/interfaces"; export interface ObjectParameters { @@ -65,6 +66,7 @@ export class FedoraObject { params.logMessage ?? "Adding datastream " + id + " to " + this.pid + " with " + data.length + " bytes", ); await this.fedora.addDatastream(this.pid, id, params, data, expectedStatus); + console.log(`Added datastream ${id} to ${this.pid}`); } async deleteDatastream(stream: string): Promise { @@ -77,7 +79,26 @@ export class FedoraObject { } async addDatastreamFromFile(filename: string, stream: string, mimeType: string): Promise { - await this.addDatastreamFromStringOrBuffer(fs.readFileSync(filename), stream, mimeType, [201]); + // Compute digest by streaming the file once (avoids loading the whole file into memory) + const md5Hash = crypto.createHash("md5"); + await new Promise((resolve, reject) => { + const rs = fs.createReadStream(filename); + rs.on("data", (chunk: Buffer) => { + md5Hash.update(chunk); + }); + rs.on("end", () => resolve()); + rs.on("error", (err) => reject(err)); + }); + const md5 = md5Hash.digest("hex"); + const digestHeader = `md5=${md5}`; + + // Create a fresh read stream for the upload + const readStream = fs.createReadStream(filename); + const params: DatastreamParameters = { + mimeType: mimeType, + logMessage: "Initial Ingest addDatastream - " + stream, + }; + await this.fedora.addDatastream(this.pid, stream, params, readStream, [201], digestHeader); } async updateDatastreamFromFile(filename: string, stream: string, mimeType: string): Promise { @@ -105,7 +126,21 @@ export class FedoraObject { mimeType: "text/xml", logMessage: "Initial Ingest addDatastream - MASTER-MD", }; + console.log("Getting fits MasterMetadata for file:", filename); const fitsXml = this.fitsMasterMetadata(filename); + + // Check if MASTER-MD exists and delete it if it does + try { + const checkResponse = await this.fedora.getDatastream(this.pid, "MASTER-MD"); + if (checkResponse.statusCode === 200) { + console.log("Deleting pre-existing MASTER-MD"); + await this.deleteDatastream("MASTER-MD"); + } + } catch (e) { + console.log("No existing MASTER-MD to delete:", e.message); + } + + console.log("Adding MASTER-MD datastream"); await this.addDatastream("MASTER-MD", params, fitsXml, [201, 204]); } @@ -221,11 +256,16 @@ export class FedoraObject { return this.fedora.getDatastreamAsBuffer(this.pid, datastream); } + async getDatastreamToTempFile(datastream: string, treatMissingAsEmpty = false): Promise { + return this.fedora.downloadDatastreamToTempFile(this.pid, datastream, treatMissingAsEmpty); + } + async getDatastreamMetadata(datastream: string): Promise { return await this.fedora.getRdf(`${this.pid}/${datastream}/fcr:metadata`); } fitsMasterMetadata(filename: string): string { + console.log("Generating FITS metadata for " + filename); const targetXml = filename + ".fits.xml"; if (!fs.existsSync(targetXml)) { const fitsCommand = this.config.fitsCommand + " -i " + filename + " -o " + targetXml; diff --git a/api/src/services/Fedora.ts b/api/src/services/Fedora.ts old mode 100644 new mode 100755 index b7cff740..08ced8db --- a/api/src/services/Fedora.ts +++ b/api/src/services/Fedora.ts @@ -9,6 +9,8 @@ import xmlescape = require("xml-escape"); import { HttpError } from "../models/HttpError"; import winston = require("winston"); import SolrCache from "./SolrCache"; +import fs = require("fs"); +import tmp = require("tmp"); export interface DatastreamParameters { dsLabel?: string; @@ -58,7 +60,7 @@ export class Fedora { protected _request( method = "get", _path = "/", - data: string | Buffer = null, + data: string | Buffer | NodeJS.ReadableStream = null, _options: Record = {}, ): Promise { const path = _path[0] == "/" ? _path.slice(1) : _path; @@ -70,7 +72,12 @@ export class Fedora { password: this.config.fedoraPassword, }; const options = Object.assign({}, auth, _options); - return http(method, url, data, options); + + return http(method, url, data, options).catch((err) => { + console.error(`Request failed for ${method.toUpperCase()} ${url}:`, err); + console.error("Full error:", JSON.stringify(err, Object.getOwnPropertyNames(err), 2)); + throw err; + }); } /** @@ -95,14 +102,18 @@ export class Fedora { } async getDatastreamAsBuffer(pid: string, datastream: string, treatMissingAsEmpty = false): Promise { + console.log("getDatastreamAsBuffer:Start"); const response = await this.getDatastream(pid, datastream); if (response.statusCode === 200) { + console.log("response.statusCode === 200"); return response.body; } if (response.statusCode === 404 && treatMissingAsEmpty) { + console.log("response.statusCode === 404"); return Buffer.from(""); } else { + console.log("Unexpected response for " + pid + "/" + datastream + ": " + response.statusCode); throw new Error("Unexpected response for " + pid + "/" + datastream + ": " + response.statusCode); } } @@ -193,6 +204,9 @@ export class Fedora { datastream: string, requestOptions = { parse_response: false }, ): Promise { + console.log("getDatastream:Start"); + console.log("pid: " + pid); + console.log("datastream: " + datastream); return await this._request( "get", pid + "/" + datastream, @@ -201,6 +215,69 @@ export class Fedora { ); } + /** + * Download a datastream directly to a temporary file to avoid buffering + * large files into memory. + * + * @param pid Record id + * @param datastream Which stream to request + * @param treatMissingAsEmpty If true, return empty temp file on 404 + */ + async downloadDatastreamToTempFile(pid: string, datastream: string, treatMissingAsEmpty = false): Promise { + const path = pid + "/" + datastream; + const urlPath = path[0] == "/" ? path.slice(1) : path; + const url = this.config.restBaseUrl + "/" + urlPath; + + const auth = { + username: this.config.fedoraUsername, + password: this.config.fedoraPassword, + }; + const options = Object.assign({}, auth); + + return new Promise((resolve, reject) => { + const tmpobj = tmp.fileSync(); + const writeStream = fs.createWriteStream(tmpobj.name); + + const req = http.get(url, options); + + req.on("response", (res) => { + if (res.statusCode === 200) { + req.pipe(writeStream); + writeStream.on("finish", () => { + resolve(tmpobj.name); + }); + writeStream.on("error", (err) => { + try { + fs.unlinkSync(tmpobj.name); + } catch (e) { + console.error(e); + } + reject(err); + }); + } else if (res.statusCode === 404 && treatMissingAsEmpty) { + // create empty file and return its path + writeStream.end(() => resolve(tmpobj.name)); + } else { + try { + fs.unlinkSync(tmpobj.name); + } catch (e) { + console.error(e); + } + reject(new Error("Unexpected response for " + pid + "/" + datastream + ": " + res.statusCode)); + } + }); + + req.on("error", (err) => { + try { + fs.unlinkSync(tmpobj.name); + } catch (e) { + console.error(e); + } + reject(err); + }); + }); + } + /** * Get DC datastream from Fedora * @@ -264,18 +341,30 @@ export class Fedora { stream: string, mimeType: string, expectedStatus = [201], - data: string | Buffer, + data: string | Buffer | NodeJS.ReadableStream, linkHeader = "", + precomputedDigest = "", ): Promise { this.cache.purgeFromCacheIfEnabled(pid); - const md5 = crypto.createHash("md5").update(data).digest("hex"); - const sha = crypto.createHash("sha512").update(data).digest("hex"); const headers: Record = { "Overwrite-Tombstone": "true", "Content-Disposition": 'attachment; filename="' + stream + '"', "Content-Type": mimeType, - Digest: "md5=" + md5 + ", sha-512=" + sha, }; + + // If caller supplied a precomputed digest (for streaming upload), use it. + if (precomputedDigest && precomputedDigest.length > 0) { + headers.Digest = precomputedDigest; + } else { + // For string/Buffer payloads, compute digests here. + if (typeof data === "string" || Buffer.isBuffer(data)) { + const md5 = crypto.createHash("md5").update(data).digest("hex"); + headers.Digest = "md5=" + md5; + } else { + // No precomputed digest and data is a stream — cannot compute here. + throw new Error("Streaming data requires a precomputed digest header to be provided"); + } + } const options = { headers: headers }; if (linkHeader.length > 0) { options.headers.Link = linkHeader; @@ -306,13 +395,22 @@ export class Fedora { pid: string, stream: string, params: DatastreamParameters, - data: string | Buffer, + data: string | Buffer | NodeJS.ReadableStream, expectedStatus = [201], + precomputedDigest = "", ): Promise { this.cache.purgeFromCacheIfEnabled(pid); // First create the stream: - await this.putDatastream(pid, stream, params.mimeType, expectedStatus, data, params.linkHeader ?? ""); + await this.putDatastream( + pid, + stream, + params.mimeType, + expectedStatus, + data, + params.linkHeader ?? "", + precomputedDigest, + ); // Now set appropriate metadata: const writer = new N3.Writer({ format: "text/turtle" }); diff --git a/api/src/services/TikaExtractor.ts b/api/src/services/TikaExtractor.ts index 4bb2b84a..ce28063e 100644 --- a/api/src/services/TikaExtractor.ts +++ b/api/src/services/TikaExtractor.ts @@ -34,6 +34,17 @@ class TikaExtractor { fs.rmSync(filename); // clean up temp file; we're done now! return result; } + + extractTextFromFile(filename: string): string { + const javaPath = this.config.javaPath; + const tikaPath = this.config.tikaPath; + const tikaConfig = this.config.tikaConfigFile ? `--config=${this.config.tikaConfigFile} ` : ""; + const tikaCommand = javaPath + " -jar " + tikaPath + " --text -eUTF8 " + tikaConfig + filename; + const result = execSync(tikaCommand, { maxBuffer: Infinity }).toString(); + fs.truncateSync(filename, 0); + fs.rmSync(filename); + return result; + } } export default TikaExtractor;