-
Notifications
You must be signed in to change notification settings - Fork 4
2gb upload limit fix #352
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
2gb upload limit fix #352
Changes from all commits
a6e9c9e
6dafa4c
ee3769b
8950899
ffe00e3
81efecb
bf00abf
aecec96
26122a8
f7cf790
1915ce5
f76c0a5
52f84ca
f0678e5
0aea737
b010f0e
6693d10
0116923
60ff6f9
bd5515c
38ab536
698fb76
82b040a
76c78a3
5261c38
eee9028
65f5602
d766da9
8e0d752
4b432ae
c80aea4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,5 @@ | ||
| import { Job as QueueJob } from "bullmq"; | ||
| import fs = require("fs"); | ||
| import tmp = require("tmp"); | ||
| import Config from "../models/Config"; | ||
| import { FedoraObject } from "../models/FedoraObject"; | ||
| import FedoraObjectFactory from "../services/FedoraObjectFactory"; | ||
|
|
@@ -22,16 +21,18 @@ class MetadataProcessor { | |
| } | ||
|
|
||
| async addMasterMetadataDatastream(): Promise<void> { | ||
| console.log(`Adding master metadata datastream to ${this.pid}`); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we still need this console.log? |
||
| const fedoraObject: FedoraObject = FedoraObject.build(this.pid, null, this.config); | ||
| const dataStream: Buffer = await fedoraObject.getDatastreamAsBuffer("MASTER"); | ||
| const contentFile = tmp.fileSync(); | ||
| fs.writeFileSync(contentFile.name, dataStream); | ||
| await fedoraObject.addMasterMetadataDatastream(contentFile.name); | ||
| fs.truncateSync(contentFile.name, 0); | ||
| fs.rmSync(contentFile.name); | ||
| console.log("FedoraObject.build: Done"); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about this one? |
||
| // Stream the MASTER datastream directly to a temporary file to avoid | ||
| // buffering very large files into memory, then run FITS on that file. | ||
| const contentPath = await fedoraObject.getDatastreamToTempFile("MASTER"); | ||
| await fedoraObject.addMasterMetadataDatastream(contentPath); | ||
| fs.truncateSync(contentPath, 0); | ||
| fs.rmSync(contentPath); | ||
| // FITS XML will have been generated in /tmp as a side-effect; clean it up: | ||
| fs.truncateSync(contentFile.name + ".fits.xml", 0); | ||
| fs.rmSync(contentFile.name + ".fits.xml"); | ||
| fs.truncateSync(contentPath + ".fits.xml", 0); | ||
| fs.rmSync(contentPath + ".fits.xml"); | ||
| } | ||
|
|
||
| async run(): Promise<void> { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -5,6 +5,7 @@ import Config from "./Config"; | |
| import { DatastreamParameters, Fedora } from "../services/Fedora"; | ||
| import FedoraDataCollector from "../services/FedoraDataCollector"; | ||
| import { execSync } from "child_process"; | ||
| import crypto = require("crypto"); | ||
| import { Agent } from "../services/interfaces"; | ||
|
|
||
| export interface ObjectParameters { | ||
|
|
@@ -65,6 +66,7 @@ export class FedoraObject { | |
| params.logMessage ?? "Adding datastream " + id + " to " + this.pid + " with " + data.length + " bytes", | ||
| ); | ||
| await this.fedora.addDatastream(this.pid, id, params, data, expectedStatus); | ||
| console.log(`Added datastream ${id} to ${this.pid}`); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it safe to remove all the console.log messages from this file as well? If you want to keep any of these messages in the long term, it would be better in this file to call |
||
| } | ||
|
|
||
| async deleteDatastream(stream: string): Promise<void> { | ||
|
|
@@ -77,7 +79,26 @@ export class FedoraObject { | |
| } | ||
|
|
||
| async addDatastreamFromFile(filename: string, stream: string, mimeType: string): Promise<void> { | ||
| await this.addDatastreamFromStringOrBuffer(fs.readFileSync(filename), stream, mimeType, [201]); | ||
| // Compute digest by streaming the file once (avoids loading the whole file into memory) | ||
| const md5Hash = crypto.createHash("md5"); | ||
| await new Promise<void>((resolve, reject) => { | ||
| const rs = fs.createReadStream(filename); | ||
| rs.on("data", (chunk: Buffer) => { | ||
| md5Hash.update(chunk); | ||
| }); | ||
| rs.on("end", () => resolve()); | ||
| rs.on("error", (err) => reject(err)); | ||
| }); | ||
| const md5 = md5Hash.digest("hex"); | ||
| const digestHeader = `md5=${md5}`; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. By default, we used to provide md5 and sha512 hashes, but you're only precomputing the md5. Is that intentional or an oversight? |
||
|
|
||
| // Create a fresh read stream for the upload | ||
| const readStream = fs.createReadStream(filename); | ||
| const params: DatastreamParameters = { | ||
| mimeType: mimeType, | ||
| logMessage: "Initial Ingest addDatastream - " + stream, | ||
| }; | ||
| await this.fedora.addDatastream(this.pid, stream, params, readStream, [201], digestHeader); | ||
| } | ||
|
|
||
| async updateDatastreamFromFile(filename: string, stream: string, mimeType: string): Promise<void> { | ||
|
|
@@ -105,7 +126,21 @@ export class FedoraObject { | |
| mimeType: "text/xml", | ||
| logMessage: "Initial Ingest addDatastream - MASTER-MD", | ||
| }; | ||
| console.log("Getting fits MasterMetadata for file:", filename); | ||
| const fitsXml = this.fitsMasterMetadata(filename); | ||
|
|
||
| // Check if MASTER-MD exists and delete it if it does | ||
| try { | ||
| const checkResponse = await this.fedora.getDatastream(this.pid, "MASTER-MD"); | ||
| if (checkResponse.statusCode === 200) { | ||
| console.log("Deleting pre-existing MASTER-MD"); | ||
| await this.deleteDatastream("MASTER-MD"); | ||
| } | ||
| } catch (e) { | ||
| console.log("No existing MASTER-MD to delete:", e.message); | ||
| } | ||
|
|
||
| console.log("Adding MASTER-MD datastream"); | ||
| await this.addDatastream("MASTER-MD", params, fitsXml, [201, 204]); | ||
| } | ||
|
|
||
|
|
@@ -221,11 +256,16 @@ export class FedoraObject { | |
| return this.fedora.getDatastreamAsBuffer(this.pid, datastream); | ||
| } | ||
|
|
||
| async getDatastreamToTempFile(datastream: string, treatMissingAsEmpty = false): Promise<string> { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Any particular reason the method is called |
||
| return this.fedora.downloadDatastreamToTempFile(this.pid, datastream, treatMissingAsEmpty); | ||
| } | ||
|
|
||
| async getDatastreamMetadata(datastream: string): Promise<string> { | ||
| return await this.fedora.getRdf(`${this.pid}/${datastream}/fcr:metadata`); | ||
| } | ||
|
|
||
| fitsMasterMetadata(filename: string): string { | ||
| console.log("Generating FITS metadata for " + filename); | ||
| const targetXml = filename + ".fits.xml"; | ||
| if (!fs.existsSync(targetXml)) { | ||
| const fitsCommand = this.config.fitsCommand + " -i " + filename + " -o " + targetXml; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not sure why these dependencies have been downgraded here; I suspect this may be some kind of merge issue and not intended as part of the PR.