Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions jvector-examples/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,11 @@
<version>7.3.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>io.nosqlbench</groupId>
<artifactId>datatools-vectordata</artifactId>
<version>0.1.22</version>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-api</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,6 @@ public interface DataSetLoader {
* @return a {@link DataSet}, if found
*/
Optional<DataSet> loadDataSet(String dataSetName);

String getName();
}
Original file line number Diff line number Diff line change
Expand Up @@ -41,17 +41,63 @@
* This dataset loader will get and load hdf5 files from <a href="https://ann-benchmarks.com/">ann-benchmarks</a>.
*/
public class DataSetLoaderHDF5 implements DataSetLoader {
private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(DataSetLoaderHDF5.class);
public static final Path HDF5_DIR = Path.of("hdf5");
private static final VectorTypeSupport vectorTypeSupport = VectorizationProvider.getInstance().getVectorTypeSupport();
public static final String HDF5_EXTN = ".hdf5";

/** Short identifier for this loader; this is the value returned by {@link #getName()}. */
public static final String NAME = "HDF5";
/**
 * Reports the name of this loader, implementing {@link DataSetLoader#getName()}.
 *
 * @return the constant {@link #NAME}
 */
public String getName() {
return NAME;
}

private static final java.util.Set<String> KNOWN_DATASETS = java.util.Set.of(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not a representative set of datasets. Let's align this better with datasets.yml. Also, we do not support the jaccard metric. These datasets should be removed.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was not able to load any of these using BenchYAML or AutoBenchYAML due to a missing catalogs.yaml. If this is supposed to be supplied by the user, please add it to our documentation. If not, please provide a reasonable default (probably in either case).

"deep-image-96-angular",
"fashion-mnist-784-euclidean",
"gist-960-euclidean",
"glove-25-angular",
"glove-50-angular",
"glove-100-angular",
"glove-200-angular",
"kosarak-jaccard",
"mnist-784-euclidean",
"movielens10m-jaccard",
"nytimes-256-angular",
"sift-128-euclidean",
"lastfm-64-dot",
"coco-i2i-512-angular",
"coco-t2i-512-angular"
);


/**
* {@inheritDoc}
*/
public Optional<DataSet> loadDataSet(String datasetName) {

// HDF5 loader does not support profiles
if (datasetName.contains(":")) {
logger.trace("Dataset '{}' has a profile, which is not supported by the HDF5 loader.", datasetName);
return Optional.empty();
}

// If not local, only download if it's explicitly known to be on ann-benchmarks.com
if (!KNOWN_DATASETS.contains(datasetName)) {
logger.trace("Dataset '{}' not in known list, skipping HDF5 download.", datasetName);
return Optional.empty();
}

// If it exists locally, we're good
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder if it would make sense to put this check before the KNOWN_DATASETS check as a way of allowing the user to add their own hdf5 datasets that are not part of the canonical set.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. If the user adds a dataset locally, it should always take precedence over the other available sources.

var dsFilePath = HDF5_DIR.resolve(datasetName + HDF5_EXTN);
if (Files.exists(dsFilePath)) {
logger.trace("Dataset '{}' already downloaded.", datasetName);
return Optional.of(readHdf5Data(dsFilePath));
}

return maybeDownloadHdf5(datasetName).map(this::readHdf5Data);
}


private DataSet readHdf5Data(Path path) {

// infer the similarity
Expand Down Expand Up @@ -114,16 +160,12 @@ else if (filename.toString().contains("-euclidean")) {
}

private Optional<Path> maybeDownloadHdf5(String datasetName) {

var dsFilePath = HDF5_DIR.resolve(datasetName+HDF5_EXTN);

if (Files.exists(dsFilePath)) {
return Optional.of(dsFilePath);
}
var dsFilePath = HDF5_DIR.resolve(datasetName + HDF5_EXTN);

// Download from https://ann-benchmarks.com/datasetName
var url = "https://ann-benchmarks.com/" + datasetName + HDF5_EXTN;
System.out.println("Downloading: " + url);
logger.info("Downloading: {}", url);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this not come after the file is found? Currently this prints for every dataset, even non-hdf5, which is annoying. I realize you put checks in for dataset existence before we get here but it could still print this, then get an HTTP_NOT_FOUND and return Optional.empty

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ditto comment on #637!



HttpURLConnection connection;
while (true) {
Expand All @@ -139,7 +181,7 @@ private Optional<Path> maybeDownloadHdf5(String datasetName) {
}
if (responseCode == HttpURLConnection.HTTP_MOVED_PERM || responseCode == HttpURLConnection.HTTP_MOVED_TEMP) {
String newUrl = connection.getHeaderField("Location");
System.out.println("Redirect detected to URL: " + newUrl);
logger.info("Redirect detected to URL: {}", newUrl);
url = newUrl;
} else {
break;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,21 @@ public class DataSetLoaderMFD implements DataSetLoader {
private static final String bucketName = "astra-vector";
private static final List<String> bucketNames = List.of(bucketName, infraBucketName);

/** Short identifier for this loader; this is the value returned by {@link #getName()}. */
public static final String NAME = "MFD";
/**
 * Reports the name of this loader, implementing {@link DataSetLoader#getName()}.
 *
 * @return the constant {@link #NAME}
 */
public String getName() {
return NAME;
}

/**
 * {@inheritDoc}
 *
 * <p>A name containing {@code ':'} is treated as carrying a profile, which this loader
 * does not handle: the request is traced and an empty result is returned. Any other name
 * is handed to {@code maybeDownloadFvecs}, and a located source is loaded through
 * {@code MultiFileDatasource::load}.
 */
public Optional<DataSet> loadDataSet(String fileName) {
    final boolean hasProfile = fileName.contains(":");

    // Supported case first: plain names go straight to the download-and-load path.
    if (!hasProfile) {
        return maybeDownloadFvecs(fileName).map(MultiFileDatasource::load);
    }

    logger.trace("Dataset {} with profile is not supported by MFD loader", fileName);
    return Optional.empty();
}

Expand Down
Loading