From e633fa1a8e5144904febe3ac59d5e8cf6ab2415b Mon Sep 17 00:00:00 2001 From: Chris Bradley Date: Wed, 10 Jan 2024 15:38:24 +0000 Subject: [PATCH] Update caching --- src/gribjump/GribInfoCache.cc | 76 +----------------- src/gribjump/GribInfoCache.h | 10 --- src/gribjump/LocalGribJump.cc | 12 +-- src/gribjump/LocalGribJump.h | 2 - src/tools/CMakeLists.txt | 4 +- src/tools/gj-cache.cc | 144 ---------------------------------- src/tools/infofromfile.cc | 99 +++++++++++++++++++++++ 7 files changed, 105 insertions(+), 242 deletions(-) delete mode 100644 src/tools/gj-cache.cc create mode 100644 src/tools/infofromfile.cc diff --git a/src/gribjump/GribInfoCache.cc b/src/gribjump/GribInfoCache.cc index 2708be0..4d3cabd 100644 --- a/src/gribjump/GribInfoCache.cc +++ b/src/gribjump/GribInfoCache.cc @@ -27,23 +27,6 @@ GribInfoCache::GribInfoCache(){} GribInfoCache::GribInfoCache(eckit::PathName dir) : cacheDir_(dir) { ASSERT(cacheDir_.exists()); - const eckit::PathName path = cacheDir_ / "manifest.gj"; - if(path.exists()){ - eckit::FileStream s(path, "r"); - s >> manifest_; - s.close(); - } -} - -void GribInfoCache::preload() { - for (auto& entry : manifest_) { - const eckit::PathName infopath = cacheDir_ / entry.second; - eckit::FileStream s(infopath, "r"); - std::map cache; - s >> cache; - s.close(); - cache_.merge(cache); - } } bool GribInfoCache::contains(const fdb5::FieldLocation& loc) { @@ -57,20 +40,14 @@ bool GribInfoCache::contains(const fdb5::FieldLocation& loc) { return true; } - // Check if field's filename is in manifest - const auto el = manifest_.find(fdbfilename); - if (el == manifest_.end()) { - return false; - } - // Check if gribinfo cache file exists (i.e. manifest is not stale) - eckit::PathName infopath = cacheDir_ / el->second; + eckit::PathName infopath = cacheDir_ / fdbfilename + ".gj"; if (!infopath.exists()) { return false; } - // This field is in the cache, but not in memory. Load it. 
- eckit::Log::debug() << "Merging " << infopath << " with cache" << std::endl; + // Field should be cached on disk, but is not in memory. + eckit::Log::debug() << "Loading " << infopath << " into cache" << std::endl; eckit::FileStream s(infopath, "r"); std::map cache; s >> cache; @@ -96,10 +73,6 @@ void GribInfoCache::print(std::ostream& s) const { // Print the manifest, then the cache s << "GribInfoCache["; s << "cacheDir=" << cacheDir_ << std::endl; - s << "#entries=" << manifest_.size() << std::endl; - for (auto& entry : manifest_) { - s << entry.first << " -> " << entry.second << std::endl; - } s << "cache=" << std::endl; for (auto& entry : cache_) { s << entry.first << " -> " << entry.second << std::endl; @@ -107,48 +80,5 @@ void GribInfoCache::print(std::ostream& s) const { s << "]"; } -void GribInfoCache::removeOld(int Ndays){ - // remove entries in manifest older than Ndays - // and delete the corresponding gribinfo files. - - eckit::Date now(eckit::TimeStamp("%Y-%m-%d")); - std::vector toRemove; - - for (auto& entry : manifest_) { - // Get date from filename - std::string timestamp = entry.second.substr(0, 10); - eckit::Date date(timestamp); - - if (now - date > Ndays) { - eckit::PathName infopath = cacheDir_ / entry.second; - toRemove.push_back(entry.first); - } - } - - for (auto& key : toRemove) { - eckit::PathName infopath = cacheDir_ / manifest_.at(key); - // Paranoia: ensure the file ends with .gj before deleting. - ASSERT(infopath.baseName().extension() == ".gj"); - infopath.unlink(); - manifest_.erase(key); - eckit::Log::debug() << "Removed " << infopath << " from manifest" << std::endl; - } -} - -void GribInfoCache::dump() const{ - // Dump the manifest to disk, overwriting the old one. 
- eckit::PathName manifestpath = cacheDir_ / "manifest.gj"; - eckit::FileStream s(manifestpath, "w"); - s << manifest_; - s.close(); -} - -bool GribInfoCache::lookup(const std::string& fdbfilename) const{ - // Check if field's filename is in manifest - return manifest_.count(fdbfilename) != 0; -} -void GribInfoCache::append(const std::string& fdbfilename, const std::string& gribinfofilename){ - manifest_[fdbfilename] = gribinfofilename; -} } // namespace gribjump diff --git a/src/gribjump/GribInfoCache.h b/src/gribjump/GribInfoCache.h index f630ae6..2d2b32a 100644 --- a/src/gribjump/GribInfoCache.h +++ b/src/gribjump/GribInfoCache.h @@ -35,23 +35,13 @@ class GribInfoCache { // Get gribinfo from memory const JumpInfo& get(const fdb5::FieldLocation& loc); - // Preload all gribinfos listed in manifest into memory - void preload(); - void print(std::ostream& s) const; - // Manifest maintenance - bool lookup(const std::string& fdbfilename) const; - void append(const std::string& fdbfilename, const std::string& gribinfofilename); - void removeOld(int days); - void dump() const; - private: eckit::PathName cacheDir_; // fieldlocation's fdb filename -> gribinfo filename - std::map manifest_; // fieldlocation's full name -> gribinfo std::map cache_; diff --git a/src/gribjump/LocalGribJump.cc b/src/gribjump/LocalGribJump.cc index eca1eeb..56b0309 100644 --- a/src/gribjump/LocalGribJump.cc +++ b/src/gribjump/LocalGribJump.cc @@ -35,12 +35,6 @@ LocalGribJump::LocalGribJump(const Config& config): GribJumpBase(config) { eckit::Log::debug() << "GribJump not using cache" << std::endl; return; } - eckit::PathName manifestPath = eckit::PathName(cacheDir) / "manifest.gj"; - if (!manifestPath.exists()) { - eckit::Log::warning() << "Warning " << manifestPath << " does not exist." 
<< std::endl; - eckit::Log::debug() << "GribJump not using cache" << std::endl; - return; - } eckit::Log::debug() << "GribJump is using cache" << std::endl; cache_ = GribInfoCache(cacheDir); @@ -139,14 +133,10 @@ ExtractionResult LocalGribJump::directJump(eckit::DataHandle* handle, return info.extractRanges(dataSource, ranges); } -bool LocalGribJump::isCached(std::string key) const { - NOTIMP; -} - JumpInfo LocalGribJump::extractInfo(const fdb5::FieldLocation& loc) { if (cacheEnabled_) { if(cache_.contains(loc)) return cache_.get(loc); - eckit::Log::debug() << "GribJump::extractInfo() cache miss" << std::endl; + eckit::Log::debug() << "GribJump::extractInfo() cache miss for file " << loc.uri().path().baseName() << std::endl; } eckit::DataHandle* handle = loc.dataHandle(); diff --git a/src/gribjump/LocalGribJump.h b/src/gribjump/LocalGribJump.h index a86f80b..a8df615 100644 --- a/src/gribjump/LocalGribJump.h +++ b/src/gribjump/LocalGribJump.h @@ -29,8 +29,6 @@ class LocalGribJump : public GribJumpBase { // JumpInfo extractInfo(eckit::DataHandle* handle) const; JumpInfo extractInfo(const fdb5::FieldLocation& loc); - bool isCached(std::string) const; - std::map> axes(const std::string& request) override; diff --git a/src/tools/CMakeLists.txt b/src/tools/CMakeLists.txt index a03b92f..b388d53 100644 --- a/src/tools/CMakeLists.txt +++ b/src/tools/CMakeLists.txt @@ -28,8 +28,8 @@ ecbuild_add_executable( TARGET gj-testc LIBS gribjump ) -ecbuild_add_executable( TARGET gj-cache - SOURCES gj-cache.cc +ecbuild_add_executable( TARGET gribinfo-fromfile + SOURCES infofromfile.cc INCLUDES ${ECKIT_INCLUDE_DIRS} LIBS gribjump ) \ No newline at end of file diff --git a/src/tools/gj-cache.cc b/src/tools/gj-cache.cc deleted file mode 100644 index 4e9e785..0000000 --- a/src/tools/gj-cache.cc +++ /dev/null @@ -1,144 +0,0 @@ -/* - * (C) Copyright 1996- ECMWF. 
- * - * This software is licensed under the terms of the Apache Licence Version 2.0 - * which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. - * In applying this licence, ECMWF does not waive the privileges and immunities - * granted to it by virtue of its status as an intergovernmental organisation nor - * does it submit to any jurisdiction. - */ - -#include "eckit/runtime/Application.h" -#include "eckit/serialisation/FileStream.h" -#include "eckit/log/Log.h" -#include "eckit/log/TimeStamp.h" -#include "eckit/types/Date.h" -#include "eckit/option/CmdArgs.h" -#include "eckit/option/SimpleOption.h" -#include "eckit/value/Value.h" - - -#include "fdb5/api/FDB.h" -#include "fdb5/api/helpers/FDBToolRequest.h" -#include "fdb5/tools/FDBTool.h" - -#include "gribjump/GribHandleData.h" -#include "gribjump/GribInfoCache.h" -#include "gribjump/LibGribJump.h" -#include "gribjump/Config.h" - -namespace gribjump { - - -// TODO Probably doesn't need to be an FDBTool -class CacheTool : public fdb5::FDBTool { - virtual void execute(const eckit::option::CmdArgs &args); - virtual void usage(const std::string &tool) const; - virtual int numberOfPositionalArguments() const { return 1; } - public: - CacheTool(int argc, char **argv): fdb5::FDBTool(argc, argv) { - options_.push_back(new eckit::option::SimpleOption( - "lifetime", "Clean up cache files older than this many days.")); - } -}; - -void CacheTool::usage(const std::string &tool) const { - eckit::Log::info() << std::endl - << "Usage: " << tool << " req_str" << std::endl; - - fdb5::FDBTool::usage(tool); -} - -void CacheTool::execute(const eckit::option::CmdArgs &args) { - // This tool should do the following: - // 1. Use fdb list to get all fieldlocations of levtype=sfc - // 2. For each fieldlocation, get the gribinfo - // 3. Add each gribinfo to the cache object - // 4. Write the cache object to disk - - if (!getenv("GRIBJUMP_CONFIG_FILE")) { - eckit::Log::error() << "GRIBJUMP_CONFIG_FILE not set." 
<< std::endl; - return; - } - - eckit::PathName configPath(getenv("GRIBJUMP_CONFIG_FILE")); - Config config(configPath); - - std::string str; - config.get("cache", str); - eckit::PathName cacheDir(str); - eckit::PathName manifestpath = cacheDir / eckit::PathName("manifest.gj"); - - if (!cacheDir.exists()) { - eckit::Log::error() << "Cache directory " << cacheDir << " does not exist." << std::endl; - return; - } - GribInfoCache gribinfoCache(cacheDir); - unsigned long lifetime = args.getInt("lifetime", 5); - gribinfoCache.removeOld(lifetime); - fdb5::FDB fdb; - std::string req = args(0); - std::vector x = fdb5::FDBToolRequest::requestsFromString(req); - - ASSERT(x.size() == 1); - - fdb5::ListIterator it = fdb.list(x[0]); - fdb5::ListElement el; - std::map> newInfos; - int nfields = 0; - while (it.next(el)) { - const fdb5::FieldLocation& loc = el.location(); - std::string fdbfilename = loc.uri().path().baseName(); - - // check if this file is already in cache - if (gribinfoCache.lookup(fdbfilename)) { - continue; - } - - eckit::DataHandle* handle = loc.dataHandle(); - JumpHandle dataSource(handle); - JumpInfo info = dataSource.extractInfo(); - std::string offset = std::to_string(loc.offset()); - std::string key = fdbfilename + "." 
+ offset; - newInfos[fdbfilename][key] = info; - nfields++; - } - - // Write GribInfo files to disk, and update manifest - for (const auto& kv : newInfos) { - const std::string& fieldfilename = kv.first; - const std::map& cache = kv.second; - // const std::string cachefilename = "2023-12-01" + fieldfilename + ".gj"; - const std::string cachefilename = std::string(eckit::TimeStamp("%Y-%m-%d.")) + fieldfilename + ".gj"; - const eckit::PathName cachepath = cacheDir / cachefilename; - eckit::FileStream s(cachepath, "w"); - s << cache; - s.close(); - - gribinfoCache.append(fieldfilename, cachefilename); - } - - // persist manifest - gribinfoCache.dump(); - - eckit::Log::debug() << "Generated GribInfo for " << nfields << " new fields in " << newInfos.size() << " files." << std::endl; - - - // Debug: Try reading it back in and creating GribInfoCache - // GribInfoCache cache(cacheDir); - // cache.print(std::cout); - // cache.preload(); - // cache.print(std::cout); - -} - -//---------------------------------------------------------------------------------------------------------------------- - -} // namespace gribjump - - -int main(int argc, char** argv) { - gribjump::CacheTool app(argc, argv); - app.start(); - return 0; -} \ No newline at end of file diff --git a/src/tools/infofromfile.cc b/src/tools/infofromfile.cc new file mode 100644 index 0000000..b579608 --- /dev/null +++ b/src/tools/infofromfile.cc @@ -0,0 +1,99 @@ +/* + * (C) Copyright 1996- ECMWF. + * + * This software is licensed under the terms of the Apache Licence Version 2.0 + * which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. + * In applying this licence, ECMWF does not waive the privileges and immunities + * granted to it by virtue of its status as an intergovernmental organisation nor + * does it submit to any jurisdiction. 
+ */
+
+#include "eckit/runtime/Application.h"
+#include "eckit/serialisation/FileStream.h"
+#include "eckit/log/Log.h"
+#include "eckit/log/TimeStamp.h"
+#include "eckit/types/Date.h"
+#include "eckit/option/CmdArgs.h"
+#include "eckit/option/SimpleOption.h"
+#include "eckit/value/Value.h"
+#include "eckit/utils/StringTools.h"
+
+#include "fdb5/api/FDB.h"
+#include "fdb5/api/helpers/FDBToolRequest.h"
+#include "fdb5/tools/FDBTool.h"
+
+#include "gribjump/GribHandleData.h"
+#include "gribjump/GribInfoCache.h"
+#include "gribjump/LibGribJump.h"
+#include "gribjump/Config.h"
+
+namespace gribjump {
+
+/// Command-line tool: extracts the JumpInfo of every listed field of one data file and
+/// serialises them as a single map into "<outdir>/<basename of the data file>.gj".
+// TODO Probably doesn't need to be an FDBTool
+class FileCacher : public fdb5::FDBTool {
+    void execute(const eckit::option::CmdArgs &args) override;
+    void usage(const std::string &tool) const override;
+    int numberOfPositionalArguments() const override { return 2; }
+  public:
+    FileCacher(int argc, char **argv): fdb5::FDBTool(argc, argv) {
+        options_.push_back(new eckit::option::SimpleOption<std::string>("outdir", "Directory to write cache files to"));
+    }
+};
+
+/// Print the tool-specific synopsis, then delegate to the generic FDBTool usage text.
+void FileCacher::usage(const std::string &tool) const {
+    eckit::Log::info() << std::endl
+                       << "Usage: " << tool << " [--outdir=<dir>] filename.data 100,200" << std::endl;
+
+    fdb5::FDBTool::usage(tool);
+}
+
+/// Build the "<basename>.<offset>" -> JumpInfo map for one data file and write it to disk.
+/// @param args  args(0): path of the data file; args(1): comma-separated offsets, one per field;
+///              option "outdir" (default "."): directory that receives the .gj file.
+void FileCacher::execute(const eckit::option::CmdArgs &args) {
+    // This tool should do the following:
+    // 1. Take a filepath as input.
+    //    Also take a list of offsets, corresponding to the start of each field in the file.
+    // 2. Create a map of filename+offset : GribInfos for each field in the file.
+    // 3. Write the GribInfos to disk, with a filename based on the input filename.
+
+    const eckit::PathName fdbfilename(args(0));
+    const std::vector<std::string> offsets = eckit::StringTools::split(",", args(1));
+
+    eckit::DataHandle* handle = fdbfilename.fileHandle();
+    JumpHandle dataSource(handle); // NOTE(review): presumably takes ownership of handle - confirm
+
+    std::map<std::string, JumpInfo> infos;
+    for (const auto& offset : offsets) {
+        const long long off = std::stoll(offset); // throws on non-numeric input
+        ASSERT(off >= 0); // offsets are positions inside the file, so never negative
+        dataSource.seek(off);
+        const std::string key = fdbfilename.baseName() + "." + offset;
+        infos[key] = dataSource.extractInfo();
+    }
+    // Duplicate offsets collapse onto one key: fail before writing an incomplete cache file.
+    ASSERT(infos.size() == offsets.size());
+
+    const eckit::PathName outdir = args.getString("outdir", ".");
+    const std::string infosfilename = fdbfilename.baseName() + ".gj";
+    const eckit::PathName cachepath = outdir/infosfilename;
+    eckit::FileStream s(cachepath, "w");
+    s << infos;
+    s.close();
+
+    eckit::Log::debug() << "Generated GribInfo for " << infos.size() << " new fields in " << cachepath << std::endl;
+}
+
+//----------------------------------------------------------------------------------------------------------------------
+
+} // namespace gribjump
+
+
+int main(int argc, char** argv) {
+    gribjump::FileCacher app(argc, argv);
+    app.start();
+    return 0;
+}
\ No newline at end of file