Skip to content

Commit

Permalink
Prevent new gathers if working area is low on free space
Browse files Browse the repository at this point in the history
This should make crawls queue up rather than creating a whole lot of failed
(or worse, 'successful' but empty) crawls in the case the working area
runs out of space.
  • Loading branch information
ato committed Jan 16, 2025
1 parent 5f786a2 commit 9b3ee17
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 1 deletion.
11 changes: 11 additions & 0 deletions gatherer/src/pandas/gatherer/core/Config.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ public class Config {
private String oidcClientSecret;
private Path legacyScripts;

// Minimum percentage of free space the working area must have before new gathers are started.
// Browsertrix stops at 10%, so let's ensure there's a little more than that
private double minFreeSpacePercent = 11.0;

/** Creates a configuration whose fields keep their declared defaults. */
public Config() {}

Expand Down Expand Up @@ -284,4 +287,12 @@ public Path getLegacyScripts() {
/**
 * Sets the legacy scripts path.
 *
 * @param legacyScripts the new value for the {@code legacyScripts} setting
 */
public void setLegacyScripts(Path legacyScripts) {
    this.legacyScripts = legacyScripts;
}

/**
 * Returns the configured minimum free space percentage.
 *
 * @return the current value of {@code minFreeSpacePercent}
 */
public double getMinFreeSpacePercent() {
    return this.minFreeSpacePercent;
}

/**
 * Sets the minimum free space percentage.
 *
 * @param minFreeSpacePercent the new threshold, expressed as a percentage
 */
public void setMinFreeSpacePercent(double minFreeSpacePercent) {
    this.minFreeSpacePercent = minFreeSpacePercent;
}
}
15 changes: 14 additions & 1 deletion gatherer/src/pandas/gatherer/core/GatherManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import jakarta.annotation.PreDestroy;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.context.Lifecycle;
import org.springframework.context.SmartLifecycle;
import org.springframework.stereotype.Component;
import pandas.collection.Title;
Expand Down Expand Up @@ -44,6 +43,7 @@ public class GatherManager implements AutoCloseable, SmartLifecycle {
private volatile boolean paused;
private boolean running;
private ScheduledFuture<?> updateGatherStatsTask;
private long lastSpaceWarningTime = 0;

public GatherManager(Config config, WorkingArea workingArea, InstanceRepository instanceRepository,
TitleRepository titleRepository, InstanceService instanceService,
Expand Down Expand Up @@ -123,6 +123,19 @@ Instance nextInstance(String gatherMethod, String threadName) {
return instance;
}

// Ensure there's enough free space in working area to allow starting of new gathers.
double freeSpacePercent = workingArea.getFreeSpacePercent();
if (freeSpacePercent < config.getMinFreeSpacePercent()) {
long now = System.currentTimeMillis();
if (lastSpaceWarningTime + 1000 * 60 < now) {
lastSpaceWarningTime = now;
log.warn("Preventing new gathers because free space percentage for {} is below the minimum " +
"threshold ({}%). Current: {}%",
workingArea.getPath(), config.getMinFreeSpacePercent(), freeSpacePercent);
}
return null;
}

// now look for titles scheduled for a new gather
// XXX: we ignore any titles that were last gathered within the current minute
// this is to ensure that we don't generate an instance with the same datestring
Expand Down
18 changes: 18 additions & 0 deletions gatherer/src/pandas/gatherer/core/WorkingArea.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import pandas.gatherer.httrack.Pandora2Warc;
import pandas.gatherer.repository.Repository;

import java.io.File;
import java.io.IOException;
import java.nio.file.*;
import java.nio.file.attribute.BasicFileAttributes;
Expand Down Expand Up @@ -264,4 +265,21 @@ public void cleanupToDelete() {
/**
 * Builds the absolute directory path for an instance, laid out under the working
 * directory as {@code <pi>/<dateString>}.
 *
 * @param pi         numeric identifier used as the first path segment
 * @param dateString date string used as the second path segment
 * @return the absolute path {@code workingdir/pi/dateString}
 */
public Path getInstanceDir(long pi, String dateString) {
    Path piDir = workingdir.resolve(Long.toString(pi));
    Path instanceDir = piDir.resolve(dateString);
    return instanceDir.toAbsolutePath();
}

/**
 * Calculates how much of the working directory's filesystem is still available
 * for writing, as a percentage of its total size.
 *
 * <p>The figure is based on {@link File#getUsableSpace()} rather than
 * {@link File#getFreeSpace()}: free space counts every unallocated byte, including
 * blocks this process may not be permitted to use (e.g. space reserved for root or
 * restricted by quota), and so can overstate the room that is really left.
 *
 * @return usable space as a percentage of total space, or {@code 50.0} as a fallback
 *         when the total size of the filesystem cannot be determined
 */
public double getFreeSpacePercent() {
    File file = workingdir.toFile();
    long totalSpace = file.getTotalSpace();

    // getTotalSpace() reports 0 when the size cannot be obtained; guard against a
    // division by zero (and any non-positive value) and fall back to a fixed figure.
    if (totalSpace <= 0) {
        log.warn("Unable to calculate free space percentage. Total space is 0.");
        return 50.0;
    }

    long usableSpace = file.getUsableSpace();
    return ((double) usableSpace / totalSpace) * 100;
}

/**
 * Returns the working directory this working area operates on.
 *
 * @return the {@code workingdir} path
 */
public Path getPath() {
    return this.workingdir;
}
}

0 comments on commit 9b3ee17

Please sign in to comment.