Skip to content

Commit 78d0153

Browse files
authored
Optimize performance when Git is used as storage repository (#1121)
2 parents 36d2390 + a9ed0dc commit 78d0153

File tree

8 files changed

+103
-26
lines changed

8 files changed

+103
-26
lines changed

CHANGELOG.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,14 @@
22

33
All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
44

5+
## Unreleased [patch]
6+
7+
> Development of this release was supported by the [French Ministry for Foreign Affairs](https://www.diplomatie.gouv.fr/fr/politique-etrangere-de-la-france/diplomatie-numerique/) through its ministerial [State Startups incubator](https://beta.gouv.fr/startups/open-terms-archive.html) under the aegis of the Ambassador for Digital Affairs.
8+
9+
### Changed
10+
11+
- Optimize performance for Git storage
12+
513
## 2.7.1 - 2024-11-21
614

715
_Full changeset and discussions: [#1120](https://github.com/OpenTermsArchive/engine/pull/1120)._

package-lock.json

Lines changed: 14 additions & 9 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@
9696
"puppeteer-extra": "^3.3.6",
9797
"puppeteer-extra-plugin-stealth": "^2.11.2",
9898
"sib-api-v3-sdk": "^8.2.1",
99-
"simple-git": "^3.8.0",
99+
"simple-git": "^3.27.0",
100100
"swagger-jsdoc": "^6.2.8",
101101
"swagger-ui-express": "^5.0.0",
102102
"winston": "^3.9.0",

src/archivist/index.js

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ export const EVENTS = [
2828
'trackingStarted',
2929
'trackingCompleted',
3030
'inaccessibleContent',
31+
'info',
3132
'error',
3233
'pluginError',
3334
];
@@ -45,6 +46,7 @@ export default class Archivist extends events.EventEmitter {
4546
}
4647

4748
async initialize() {
49+
this.emit('info', 'Initializing engine…');
4850
if (this.services) {
4951
return;
5052
}
@@ -67,6 +69,8 @@ export default class Archivist extends events.EventEmitter {
6769
process.exit(1);
6870
});
6971

72+
this.emit('info', 'Initialization completed');
73+
7074
return this;
7175
}
7276

@@ -140,7 +144,13 @@ export default class Archivist extends events.EventEmitter {
140144
return;
141145
}
142146

143-
return this.recordVersion(terms, extractOnly);
147+
await this.recordVersion(terms, extractOnly);
148+
149+
terms.sourceDocuments.forEach(sourceDocument => {
150+
sourceDocument.content = null; // Reduce memory usage by clearing no longer needed large content strings
151+
sourceDocument.mimeType = null; // …and associated MIME type
152+
sourceDocument.snapshotId = null; // …and associated snapshot ID for consistency
153+
});
144154
}
145155

146156
async fetchSourceDocuments(terms) {

src/archivist/recorder/repositories/git/git.js

Lines changed: 23 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,10 @@ export default class Git {
1818
await fs.mkdir(this.path, { recursive: true });
1919
}
2020

21-
this.git = simpleGit(this.path, { maxConcurrentProcesses: 1 });
21+
this.git = simpleGit(this.path, {
22+
trimmed: true,
23+
maxConcurrentProcesses: 1,
24+
});
2225

2326
await this.git.init();
2427

@@ -27,7 +30,8 @@ export default class Git {
2730
.addConfig('push.default', 'current')
2831
.addConfig('user.name', this.author.name)
2932
.addConfig('user.email', this.author.email)
30-
.addConfig('core.quotePath', false); // disable Git's encoding of special characters in pathnames. For example, `service·A` will be encoded as `service\302\267A` without this setting, leading to issues. See https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath
33+
.addConfig('core.quotePath', false) // Disable Git's encoding of special characters in pathnames. For example, `service·A` will be encoded as `service\302\267A` without this setting, leading to issues. See https://git-scm.com/docs/git-config#Documentation/git-config.txt-corequotePath
34+
.addConfig('core.commitGraph', true); // Enable `commit-graph` feature for efficient commit data storage, improving performance of operations like `git log`
3135
}
3236

3337
add(filePath) {
@@ -42,7 +46,7 @@ export default class Git {
4246
process.env.GIT_AUTHOR_DATE = commitDate;
4347
process.env.GIT_COMMITTER_DATE = commitDate;
4448

45-
summary = await this.git.commit(message, filePath);
49+
summary = await this.git.commit(message, filePath, ['--no-verify']); // Skip pre-commit and commit-msg hooks, as commits are programmatically managed, to optimize performance
4650
} finally {
4751
process.env.GIT_AUTHOR_DATE = '';
4852
process.env.GIT_COMMITTER_DATE = '';
@@ -60,11 +64,11 @@ export default class Git {
6064
}
6165

6266
listCommits(options = []) {
63-
return this.log([ '--reverse', '--no-merges', '--name-only', ...options ]);
67+
return this.log([ '--reverse', '--no-merges', '--name-only', ...options ]); // Returns all commits in chronological order (`--reverse`), excluding merge commits (`--no-merges`), with modified files names (`--name-only`)
6468
}
6569

6670
async getCommit(options) {
67-
const [commit] = await this.listCommits([ '-1', ...options ]);
71+
const [commit] = await this.listCommits([ '-1', ...options ]); // Returns only the most recent commit matching the given options
6872

6973
return commit;
7074
}
@@ -103,8 +107,8 @@ export default class Git {
103107
return this.git.clean('f', '-d');
104108
}
105109

106-
async getFullHash(shortHash) {
107-
return (await this.git.show([ shortHash, '--pretty=%H', '-s' ])).trim();
110+
getFullHash(shortHash) {
111+
return this.git.show([ shortHash, '--pretty=%H', '-s' ]);
108112
}
109113

110114
restore(path, commit) {
@@ -120,4 +124,16 @@ export default class Git {
120124
relativePath(absolutePath) {
121125
return path.relative(this.path, absolutePath); // Git needs a path relative to the .git directory, not an absolute one
122126
}
127+
128+
async listFiles(path) {
129+
return (await this.git.raw([ 'ls-files', path ])).split('\n');
130+
}
131+
132+
async writeCommitGraph() {
133+
await this.git.raw([ 'commit-graph', 'write', '--reachable', '--changed-paths' ]);
134+
}
135+
136+
async updateCommitGraph() {
137+
await this.git.raw([ 'commit-graph', 'write', '--reachable', '--changed-paths', '--append' ]);
138+
}
123139
}

src/archivist/recorder/repositories/git/index.js

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ export default class GitRepository extends RepositoryInterface {
2929
async initialize() {
3030
await this.git.initialize();
3131
await this.git.cleanUp(); // Drop all uncommitted changes and remove all leftover files that may be present if the process was killed aggressively
32+
await this.git.writeCommitGraph(); // Create or replace the commit graph with a new one to ensure it's fully consistent
3233

3334
return this;
3435
}
@@ -56,17 +57,22 @@ export default class GitRepository extends RepositoryInterface {
5657
return record;
5758
}
5859

59-
finalize() {
60-
if (!this.needsPublication) {
61-
return;
60+
async finalize() {
61+
if (this.needsPublication) {
62+
await this.git.pushChanges();
6263
}
6364

64-
return this.git.pushChanges();
65+
return this.git.updateCommitGraph();
6566
}
6667

6768
async findLatest(serviceId, termsType, documentId) {
68-
const filePath = DataMapper.generateFilePath(serviceId, termsType, documentId);
69-
const commit = await this.git.getCommit([filePath]);
69+
const matchingFilesPaths = await this.git.listFiles(DataMapper.generateFilePath(serviceId, termsType, documentId));
70+
71+
if (!matchingFilesPaths.length) {
72+
return null;
73+
}
74+
75+
const commit = await this.git.getCommit([...matchingFilesPaths]); // Returns the most recent commit that modified any of the matching files. If multiple files match the path pattern (e.g. both HTML and PDF versions exist), returns the commit that last modified any of them
7076

7177
return this.#toDomain(commit);
7278
}

src/logger/index.js

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ import config from 'config';
44
import winston from 'winston';
55
import 'winston-mail';
66

7+
import { formatDuration } from './utils.js';
8+
79
const { combine, timestamp, printf, colorize } = winston.format;
810

911
const alignedWithColorsAndTime = combine(
@@ -82,6 +84,7 @@ logger.configure({
8284

8385
let recordedSnapshotsCount;
8486
let recordedVersionsCount;
87+
let trackingStartTime;
8588

8689
logger.onFirstSnapshotRecorded = ({ serviceId, termsType, documentId, id }) => {
8790
logger.info({ message: `Recorded first snapshot with id ${id}`, serviceId, termsType, documentId });
@@ -119,14 +122,17 @@ logger.onTrackingStarted = (numberOfServices, numberOfTerms, extractOnly) => {
119122
}
120123
recordedSnapshotsCount = 0;
121124
recordedVersionsCount = 0;
125+
trackingStartTime = Date.now();
122126
};
123127

124128
logger.onTrackingCompleted = (numberOfServices, numberOfTerms, extractOnly) => {
129+
const duration = formatDuration(Date.now() - trackingStartTime);
130+
125131
if (extractOnly) {
126-
logger.info(`Examined ${numberOfTerms} terms from ${numberOfServices} services for extraction`);
132+
logger.info(`Examined ${numberOfTerms} terms from ${numberOfServices} services for extraction in ${duration}`);
127133
logger.info(`Recorded ${recordedVersionsCount} new versions\n`);
128134
} else {
129-
logger.info(`Tracked changes of ${numberOfTerms} terms from ${numberOfServices} services`);
135+
logger.info(`Tracked changes of ${numberOfTerms} terms from ${numberOfServices} services in ${duration}`);
130136
logger.info(`Recorded ${recordedSnapshotsCount} new snapshots and ${recordedVersionsCount} new versions\n`);
131137
}
132138
};
@@ -139,6 +145,10 @@ logger.onError = (error, terms) => {
139145
logger.error({ message: error.stack, serviceId: terms.service.id, termsType: terms.type });
140146
};
141147

148+
logger.onInfo = message => {
149+
logger.info({ message });
150+
};
151+
142152
logger.onPluginError = (error, pluginName) => {
143153
logger.error({ message: `Error in "${pluginName}" plugin: ${error.stack}` });
144154
};

src/logger/utils.js

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
const MILLISECONDS_PER_SECOND = 1000;
const SECONDS_PER_MINUTE = 60;
const SECONDS_PER_HOUR = 3600;

/**
 * Formats a duration as a human-readable string, e.g. `2 hours and 2 minutes and 5 seconds`.
 *
 * Sub-second precision is dropped (rounded down). Units whose value is zero are omitted,
 * except that a duration shorter than one second is rendered as `0 seconds`.
 * Negative or non-finite durations (e.g. a difference of wall-clock timestamps after the
 * clock was set back, or a missing start time) are treated as a zero duration instead of
 * producing output such as `-1 seconds` or `NaN seconds`.
 *
 * @param {number} milliseconds - Duration in milliseconds
 * @returns {string} Units joined with ` and `
 */
export const formatDuration = milliseconds => {
  const pluralize = (count, unit) => `${count} ${unit}${count === 1 ? '' : 's'}`;

  const numericDuration = Number(milliseconds); // Keep accepting numeric strings, as the arithmetic coercion previously did
  const isUsableDuration = Number.isFinite(numericDuration) && numericDuration > 0;
  const totalSeconds = isUsableDuration ? Math.floor(numericDuration / MILLISECONDS_PER_SECOND) : 0;

  const hours = Math.floor(totalSeconds / SECONDS_PER_HOUR);
  const minutes = Math.floor((totalSeconds % SECONDS_PER_HOUR) / SECONDS_PER_MINUTE);
  const seconds = totalSeconds % SECONDS_PER_MINUTE;

  const parts = [];

  if (hours > 0) {
    parts.push(pluralize(hours, 'hour'));
  }

  if (minutes > 0) {
    parts.push(pluralize(minutes, 'minute'));
  }

  // Always output at least one unit, so that a zero duration reads `0 seconds` rather than an empty string
  if (seconds > 0 || parts.length === 0) {
    parts.push(pluralize(seconds, 'second'));
  }

  return parts.join(' and ');
};

0 commit comments

Comments
 (0)