Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
a6e9c9e
matching the edges
Jason-Benson Jan 5, 2026
6dafa4c
more dependencies
Jason-Benson Jan 5, 2026
ee3769b
more @typescript-eslint dependencies
Jason-Benson Jan 5, 2026
8950899
switch undefined to null per DK
Jason-Benson Jan 5, 2026
ffe00e3
version matching
Jason-Benson Jan 6, 2026
81efecb
rolled bullmq back to 4
Jason-Benson Jan 6, 2026
bf00abf
Merge remote-tracking branch 'upstream/dev' into depend6
Jason-Benson Jan 6, 2026
aecec96
Fix tablesort nonsense.
demiankatz Jan 6, 2026
26122a8
version matching
Jason-Benson Jan 6, 2026
f7cf790
Merge remote-tracking branch 'jason/depend6' into depend7
Jason-Benson Jan 6, 2026
1915ce5
reupdated express
Jason-Benson Jan 6, 2026
f76c0a5
Downgrade tablesort.
demiankatz Jan 6, 2026
52f84ca
matching the edges
Jason-Benson Jan 5, 2026
f0678e5
more dependencies
Jason-Benson Jan 5, 2026
0aea737
more @typescript-eslint dependencies
Jason-Benson Jan 5, 2026
b010f0e
switch undefined to null per DK
Jason-Benson Jan 5, 2026
6693d10
version matching
Jason-Benson Jan 6, 2026
0116923
rolled bullmq back to 4
Jason-Benson Jan 6, 2026
60ff6f9
Fix tablesort nonsense.
demiankatz Jan 6, 2026
bd5515c
version matching
Jason-Benson Jan 6, 2026
38ab536
reupdated express
Jason-Benson Jan 6, 2026
698fb76
Downgrade tablesort.
demiankatz Jan 6, 2026
82b040a
Modified file ingest to work around node 2gb limit, but not done
Jason-Benson Jan 13, 2026
76c78a3
Commit from GitHub Actions (Lint Pull Requests)
github-actions[bot] Jan 13, 2026
5261c38
WiP
Jason-Benson Jan 13, 2026
eee9028
Deletes any preexisting metadata before attempting to write new data
Jason-Benson Jan 20, 2026
65f5602
Merge branch 'depend8' into 2gb_limit2
Jason-Benson Jan 20, 2026
d766da9
dependency matching
Jason-Benson Jan 20, 2026
8e0d752
cleaned up lint nitpicks
Jason-Benson Jan 20, 2026
4b432ae
removed unneeded logging
Jason-Benson Jan 20, 2026
c80aea4
Commit from GitHub Actions (Lint Pull Requests)
github-actions[bot] Jan 20, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 37 additions & 68 deletions api/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions api/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@
"locutus": "^2",
"morgan": "~1.10.1",
"mysql2": "^3.16",
"n3": "^2",
"nanoid": "^5.1.6",
"n3": "^1",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure why these dependencies have been downgraded here; I suspect this may be some kind of merge issue and not intended as part of the PR.

"nanoid": "^3.3.11",
"needle": "^3.1.0",
"node-tesseract-ocr": "^2",
"passport": "^0.7.0",
Expand Down
19 changes: 10 additions & 9 deletions api/src/jobs/Metadata.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import { Job as QueueJob } from "bullmq";
import fs = require("fs");
import tmp = require("tmp");
import Config from "../models/Config";
import { FedoraObject } from "../models/FedoraObject";
import FedoraObjectFactory from "../services/FedoraObjectFactory";
Expand All @@ -22,16 +21,18 @@ class MetadataProcessor {
}

async addMasterMetadataDatastream(): Promise<void> {
console.log(`Adding master metadata datastream to ${this.pid}`);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we still need this console.log?

const fedoraObject: FedoraObject = FedoraObject.build(this.pid, null, this.config);
const dataStream: Buffer = await fedoraObject.getDatastreamAsBuffer("MASTER");
const contentFile = tmp.fileSync();
fs.writeFileSync(contentFile.name, dataStream);
await fedoraObject.addMasterMetadataDatastream(contentFile.name);
fs.truncateSync(contentFile.name, 0);
fs.rmSync(contentFile.name);
console.log("FedoraObject.build: Done");
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about this one?

// Stream the MASTER datastream directly to a temporary file to avoid
// buffering very large files into memory, then run FITS on that file.
const contentPath = await fedoraObject.getDatastreamToTempFile("MASTER");
await fedoraObject.addMasterMetadataDatastream(contentPath);
fs.truncateSync(contentPath, 0);
fs.rmSync(contentPath);
// FITS XML will have been generated in /tmp as a side-effect; clean it up:
fs.truncateSync(contentFile.name + ".fits.xml", 0);
fs.rmSync(contentFile.name + ".fits.xml");
fs.truncateSync(contentPath + ".fits.xml", 0);
fs.rmSync(contentPath + ".fits.xml");
}

async run(): Promise<void> {
Expand Down
42 changes: 41 additions & 1 deletion api/src/models/FedoraObject.ts
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import Config from "./Config";
import { DatastreamParameters, Fedora } from "../services/Fedora";
import FedoraDataCollector from "../services/FedoraDataCollector";
import { execSync } from "child_process";
import crypto = require("crypto");
import { Agent } from "../services/interfaces";

export interface ObjectParameters {
Expand Down Expand Up @@ -65,6 +66,7 @@ export class FedoraObject {
params.logMessage ?? "Adding datastream " + id + " to " + this.pid + " with " + data.length + " bytes",
);
await this.fedora.addDatastream(this.pid, id, params, data, expectedStatus);
console.log(`Added datastream ${id} to ${this.pid}`);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it safe to remove all the console.log messages from this file as well? If you want to keep any of these messages in the long term, it would be better in this file to call this.log() which will send the message to the job processing log instead of to the console.

}

async deleteDatastream(stream: string): Promise<void> {
Expand All @@ -77,7 +79,26 @@ export class FedoraObject {
}

async addDatastreamFromFile(filename: string, stream: string, mimeType: string): Promise<void> {
await this.addDatastreamFromStringOrBuffer(fs.readFileSync(filename), stream, mimeType, [201]);
// Compute digest by streaming the file once (avoids loading the whole file into memory)
const md5Hash = crypto.createHash("md5");
await new Promise<void>((resolve, reject) => {
const rs = fs.createReadStream(filename);
rs.on("data", (chunk: Buffer) => {
md5Hash.update(chunk);
});
rs.on("end", () => resolve());
rs.on("error", (err) => reject(err));
});
const md5 = md5Hash.digest("hex");
const digestHeader = `md5=${md5}`;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By default, we used to provide md5 and sha512 hashes, but you're only precomputing the md5. Is that intentional or an oversight?


// Create a fresh read stream for the upload
const readStream = fs.createReadStream(filename);
const params: DatastreamParameters = {
mimeType: mimeType,
logMessage: "Initial Ingest addDatastream - " + stream,
};
await this.fedora.addDatastream(this.pid, stream, params, readStream, [201], digestHeader);
}

async updateDatastreamFromFile(filename: string, stream: string, mimeType: string): Promise<void> {
Expand Down Expand Up @@ -105,7 +126,21 @@ export class FedoraObject {
mimeType: "text/xml",
logMessage: "Initial Ingest addDatastream - MASTER-MD",
};
console.log("Getting fits MasterMetadata for file:", filename);
const fitsXml = this.fitsMasterMetadata(filename);

// Check if MASTER-MD exists and delete it if it does
try {
const checkResponse = await this.fedora.getDatastream(this.pid, "MASTER-MD");
if (checkResponse.statusCode === 200) {
console.log("Deleting pre-existing MASTER-MD");
await this.deleteDatastream("MASTER-MD");
}
} catch (e) {
console.log("No existing MASTER-MD to delete:", e.message);
}

console.log("Adding MASTER-MD datastream");
await this.addDatastream("MASTER-MD", params, fitsXml, [201, 204]);
}

Expand Down Expand Up @@ -221,11 +256,16 @@ export class FedoraObject {
return this.fedora.getDatastreamAsBuffer(this.pid, datastream);
}

async getDatastreamToTempFile(datastream: string, treatMissingAsEmpty = false): Promise<string> {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any particular reason the method is called getDatastreamToTempFile here but downloadDatastreamToTempFile in the Fedora class? I'd be inclined to use the "download" name in both places for consistency, though you can persuade me otherwise if you kept them separate for a specific reason. :-)

return this.fedora.downloadDatastreamToTempFile(this.pid, datastream, treatMissingAsEmpty);
}

async getDatastreamMetadata(datastream: string): Promise<string> {
return await this.fedora.getRdf(`${this.pid}/${datastream}/fcr:metadata`);
}

fitsMasterMetadata(filename: string): string {
console.log("Generating FITS metadata for " + filename);
const targetXml = filename + ".fits.xml";
if (!fs.existsSync(targetXml)) {
const fitsCommand = this.config.fitsCommand + " -i " + filename + " -o " + targetXml;
Expand Down
Loading
Loading