Skip to content

Commit df6267f

Browse files
etiennebarriebyroot
andcommitted
Revalidate stale entries using a digest.
Ref: #336 Bootsnap was initially designed for improving boot time in development, so it was logical to use `mtime` to detect changes given that's reliable on a given machine. But is just as useful on production and CI environments, however there its hit rate can vary a lot because depending on how the source code and caches are saved and restored, many if not all `mtime` will have changed. To improve this, we can first try to revalidate using the `mtime`, and if it fails, fallback to compare a digest of the file content. Digesting a file, even with `fnv1a_64` is of course an overhead, but the assumption is that true misses should be relatively rare and that digesting the file will always be faster than compiling it. So even if it only improve the hit rate marginally, it should be faster overall. Also we only recompute the digest if the file mtime changed, but its size remained the same, which should discard the overwhelming majority of legitimate source file changes. Co-authored-by: Jean Boussier <jean.boussier@gmail.com>
1 parent c12002f commit df6267f

File tree

7 files changed

+197
-37
lines changed

7 files changed

+197
-37
lines changed

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# Unreleased
22

3+
* Revalidate stale cache entries by digesting the source content.
4+
This should significantly improve performance in environments where `mtime` isn't preserved (e.g. CI systems doing a git clone, etc).
5+
See #468.
36
* Open source files and cache entries with `O_NOATIME` when available to reduce disk accesses. See #469.
47
* `bootsnap precompile --gemfile` now look for `.rb` files in the whole gem and not just the `lib/` directory. See #466.
58

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ Bootsnap cache misses can be monitored though a callback:
9999
Bootsnap.instrumentation = ->(event, path) { puts "#{event} #{path}" }
100100
```
101101

102-
`event` is either `:miss` or `:stale`. You can also call `Bootsnap.log!` as a shortcut to
102+
`event` is either `:miss`, `:stale` or `:revalidated`. You can also call `Bootsnap.log!` as a shortcut to
103103
log all events to STDERR.
104104

105105
To turn instrumentation back off you can set it to nil:

ext/bootsnap/bootsnap.c

Lines changed: 142 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,10 @@ struct bs_cache_key {
5858
uint32_t ruby_revision;
5959
uint64_t size;
6060
uint64_t mtime;
61-
uint64_t data_size; /* not used for equality */
62-
uint8_t pad[24];
61+
uint64_t data_size; //
62+
uint64_t digest;
63+
uint8_t digest_set;
64+
uint8_t pad[15];
6365
} __attribute__((packed));
6466

6567
/*
@@ -73,7 +75,7 @@ struct bs_cache_key {
7375
STATIC_ASSERT(sizeof(struct bs_cache_key) == KEY_SIZE);
7476

7577
/* Effectively a schema version. Bumping invalidates all previous caches */
76-
static const uint32_t current_version = 4;
78+
static const uint32_t current_version = 5;
7779

7880
/* hash of e.g. "x86_64-darwin17", invalidating when ruby is recompiled on a
7981
* new OS ABI, etc. */
@@ -93,6 +95,7 @@ static VALUE rb_cBootsnap_CompileCache_UNCOMPILABLE;
9395
static ID instrumentation_method;
9496
static VALUE sym_miss;
9597
static VALUE sym_stale;
98+
static VALUE sym_revalidated;
9699
static bool instrumentation_enabled = false;
97100
static bool readonly = false;
98101

@@ -104,9 +107,18 @@ static VALUE bs_rb_fetch(VALUE self, VALUE cachedir_v, VALUE path_v, VALUE handl
104107
static VALUE bs_rb_precompile(VALUE self, VALUE cachedir_v, VALUE path_v, VALUE handler);
105108

106109
/* Helpers */
110+
enum cache_status {
111+
miss,
112+
hit,
113+
stale,
114+
};
107115
static void bs_cache_path(const char * cachedir, const VALUE path, char (* cache_path)[MAX_CACHEPATH_SIZE]);
108116
static int bs_read_key(int fd, struct bs_cache_key * key);
109-
static int cache_key_equal(struct bs_cache_key * k1, struct bs_cache_key * k2);
117+
static enum cache_status cache_key_equal_fast_path(struct bs_cache_key * k1, struct bs_cache_key * k2);
118+
static int cache_key_equal_slow_path(struct bs_cache_key * current_key, struct bs_cache_key * cached_key, const VALUE input_data);
119+
static int update_cache_key(struct bs_cache_key *current_key, int cache_fd, const char ** errno_provenance);
120+
121+
static void bs_cache_key_digest(struct bs_cache_key * key, const VALUE input_data);
110122
static VALUE bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args);
111123
static VALUE bs_precompile(char * path, VALUE path_v, char * cache_path, VALUE handler);
112124
static int open_current_file(char * path, struct bs_cache_key * key, const char ** errno_provenance);
@@ -171,6 +183,9 @@ Init_bootsnap(void)
171183
sym_stale = ID2SYM(rb_intern("stale"));
172184
rb_global_variable(&sym_stale);
173185

186+
sym_revalidated = ID2SYM(rb_intern("revalidated"));
187+
rb_global_variable(&sym_revalidated);
188+
174189
rb_define_module_function(rb_mBootsnap, "instrumentation_enabled=", bs_instrumentation_enabled_set, 1);
175190
rb_define_module_function(rb_mBootsnap_CompileCache_Native, "readonly=", bs_readonly_set, 1);
176191
rb_define_module_function(rb_mBootsnap_CompileCache_Native, "coverage_running?", bs_rb_coverage_running, 0);
@@ -189,6 +204,14 @@ bs_instrumentation_enabled_set(VALUE self, VALUE enabled)
189204
return enabled;
190205
}
191206

207+
static inline void
208+
bs_instrumentation(VALUE event, VALUE path)
209+
{
210+
if (RB_UNLIKELY(instrumentation_enabled)) {
211+
rb_funcall(rb_mBootsnap, instrumentation_method, 2, event, path);
212+
}
213+
}
214+
192215
static VALUE
193216
bs_readonly_set(VALUE self, VALUE enabled)
194217
{
@@ -294,17 +317,53 @@ bs_cache_path(const char * cachedir, const VALUE path, char (* cache_path)[MAX_C
294317
* The data_size member is not compared, as it serves more of a "header"
295318
* function.
296319
*/
297-
static int
298-
cache_key_equal(struct bs_cache_key * k1, struct bs_cache_key * k2)
320+
static enum cache_status cache_key_equal_fast_path(struct bs_cache_key *k1,
321+
struct bs_cache_key *k2) {
322+
if (k1->version == k2->version &&
323+
k1->ruby_platform == k2->ruby_platform &&
324+
k1->compile_option == k2->compile_option &&
325+
k1->ruby_revision == k2->ruby_revision && k1->size == k2->size) {
326+
return (k1->mtime == k2->mtime) ? hit : stale;
327+
}
328+
return miss;
329+
}
330+
331+
static int cache_key_equal_slow_path(struct bs_cache_key *current_key,
332+
struct bs_cache_key *cached_key,
333+
const VALUE input_data)
299334
{
300-
return (
301-
k1->version == k2->version &&
302-
k1->ruby_platform == k2->ruby_platform &&
303-
k1->compile_option == k2->compile_option &&
304-
k1->ruby_revision == k2->ruby_revision &&
305-
k1->size == k2->size &&
306-
k1->mtime == k2->mtime
307-
);
335+
bs_cache_key_digest(current_key, input_data);
336+
return current_key->digest == cached_key->digest;
337+
}
338+
339+
static int update_cache_key(struct bs_cache_key *current_key, int cache_fd, const char ** errno_provenance)
340+
{
341+
lseek(cache_fd, 0, SEEK_SET);
342+
ssize_t nwrite = write(cache_fd, current_key, KEY_SIZE);
343+
if (nwrite < 0) {
344+
*errno_provenance = "update_cache_key:write";
345+
return -1;
346+
}
347+
348+
#ifdef HAVE_FDATASYNC
349+
if (fdatasync(cache_fd) < 0) {
350+
*errno_provenance = "update_cache_key:fdatasync";
351+
return -1;
352+
}
353+
#endif
354+
355+
return 0;
356+
}
357+
358+
/*
359+
* Fills the cache key digest.
360+
*/
361+
static void bs_cache_key_digest(struct bs_cache_key *key,
362+
const VALUE input_data) {
363+
if (key->digest_set)
364+
return;
365+
key->digest = fnv1a_64(input_data);
366+
key->digest_set = 1;
308367
}
309368

310369
/*
@@ -393,6 +452,7 @@ open_current_file(char * path, struct bs_cache_key * key, const char ** errno_pr
393452
key->ruby_revision = current_ruby_revision;
394453
key->size = (uint64_t)statbuf.st_size;
395454
key->mtime = (uint64_t)statbuf.st_mtime;
455+
key->digest_set = false;
396456

397457
return fd;
398458
}
@@ -436,7 +496,12 @@ open_cache_file(const char * path, struct bs_cache_key * key, const char ** errn
436496
{
437497
int fd, res;
438498

439-
fd = open(path, O_RDONLY | O_NOATIME);
499+
if (readonly) {
500+
fd = open(path, O_RDONLY | O_NOATIME);
501+
} else {
502+
fd = open(path, O_RDWR | O_NOATIME);
503+
}
504+
440505
if (fd < 0) {
441506
*errno_provenance = "bs_fetch:open_cache_file:open";
442507
return CACHE_MISS;
@@ -681,7 +746,7 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args
681746
int res, valid_cache = 0, exception_tag = 0;
682747
const char * errno_provenance = NULL;
683748

684-
VALUE input_data; /* data read from source file, e.g. YAML or ruby source */
749+
VALUE input_data = Qfalse; /* data read from source file, e.g. YAML or ruby source */
685750
VALUE storage_data; /* compiled data, e.g. msgpack / binary iseq */
686751
VALUE output_data; /* return data, e.g. ruby hash or loaded iseq */
687752

@@ -699,20 +764,43 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args
699764
cache_fd = open_cache_file(cache_path, &cached_key, &errno_provenance);
700765
if (cache_fd == CACHE_MISS || cache_fd == CACHE_STALE) {
701766
/* This is ok: valid_cache remains false, we re-populate it. */
702-
if (RB_UNLIKELY(instrumentation_enabled)) {
703-
rb_funcall(rb_mBootsnap, instrumentation_method, 2, cache_fd == CACHE_MISS ? sym_miss : sym_stale, path_v);
704-
}
767+
bs_instrumentation(cache_fd == CACHE_MISS ? sym_miss : sym_stale, path_v);
705768
} else if (cache_fd < 0) {
706769
exception_message = rb_str_new_cstr(cache_path);
707770
goto fail_errno;
708771
} else {
709772
/* True if the cache existed and no invalidating changes have occurred since
710773
* it was generated. */
711-
valid_cache = cache_key_equal(&current_key, &cached_key);
712-
if (RB_UNLIKELY(instrumentation_enabled)) {
713-
if (!valid_cache) {
714-
rb_funcall(rb_mBootsnap, instrumentation_method, 2, sym_stale, path_v);
774+
775+
switch(cache_key_equal_fast_path(&current_key, &cached_key)) {
776+
case hit:
777+
valid_cache = true;
778+
break;
779+
case miss:
780+
valid_cache = false;
781+
break;
782+
case stale:
783+
valid_cache = false;
784+
if ((input_data = bs_read_contents(current_fd, current_key.size,
785+
&errno_provenance)) == Qfalse) {
786+
exception_message = path_v;
787+
goto fail_errno;
788+
}
789+
valid_cache = cache_key_equal_slow_path(&current_key, &cached_key, input_data);
790+
if (valid_cache) {
791+
if (!readonly) {
792+
if (update_cache_key(&current_key, cache_fd, &errno_provenance)) {
793+
exception_message = path_v;
794+
goto fail_errno;
795+
}
796+
}
797+
bs_instrumentation(sym_revalidated, path_v);
715798
}
799+
break;
800+
};
801+
802+
if (!valid_cache) {
803+
bs_instrumentation(sym_stale, path_v);
716804
}
717805
}
718806

@@ -726,7 +814,7 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args
726814
else if (res == CACHE_UNCOMPILABLE) {
727815
/* If fetch_cached_data returned `Uncompilable` we fallback to `input_to_output`
728816
This happens if we have say, an unsafe YAML cache, but try to load it in safe mode */
729-
if ((input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse){
817+
if (input_data == Qfalse && (input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse) {
730818
exception_message = path_v;
731819
goto fail_errno;
732820
}
@@ -745,7 +833,7 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args
745833
/* Cache is stale, invalid, or missing. Regenerate and write it out. */
746834

747835
/* Read the contents of the source file into a buffer */
748-
if ((input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse){
836+
if (input_data == Qfalse && (input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse) {
749837
exception_message = path_v;
750838
goto fail_errno;
751839
}
@@ -767,6 +855,7 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args
767855
* We do however ignore any failures to persist the cache, as it's better
768856
* to move along, than to interrupt the process.
769857
*/
858+
bs_cache_key_digest(&current_key, input_data);
770859
atomic_write_cache_file(cache_path, &current_key, storage_data, &errno_provenance);
771860

772861
/* Having written the cache, now convert storage_data to output_data */
@@ -822,12 +911,16 @@ bs_fetch(char * path, VALUE path_v, char * cache_path, VALUE handler, VALUE args
822911
static VALUE
823912
bs_precompile(char * path, VALUE path_v, char * cache_path, VALUE handler)
824913
{
914+
if (readonly) {
915+
return Qfalse;
916+
}
917+
825918
struct bs_cache_key cached_key, current_key;
826919
int cache_fd = -1, current_fd = -1;
827920
int res, valid_cache = 0, exception_tag = 0;
828921
const char * errno_provenance = NULL;
829922

830-
VALUE input_data; /* data read from source file, e.g. YAML or ruby source */
923+
VALUE input_data = Qfalse; /* data read from source file, e.g. YAML or ruby source */
831924
VALUE storage_data; /* compiled data, e.g. msgpack / binary iseq */
832925

833926
/* Open the source file and generate a cache key for it */
@@ -843,7 +936,28 @@ bs_precompile(char * path, VALUE path_v, char * cache_path, VALUE handler)
843936
} else {
844937
/* True if the cache existed and no invalidating changes have occurred since
845938
* it was generated. */
846-
valid_cache = cache_key_equal(&current_key, &cached_key);
939+
switch(cache_key_equal_fast_path(&current_key, &cached_key)) {
940+
case hit:
941+
valid_cache = true;
942+
break;
943+
case miss:
944+
valid_cache = false;
945+
break;
946+
case stale:
947+
valid_cache = false;
948+
if ((input_data = bs_read_contents(current_fd, current_key.size, &errno_provenance)) == Qfalse) {
949+
goto fail;
950+
}
951+
valid_cache = cache_key_equal_slow_path(&current_key, &cached_key, input_data);
952+
if (valid_cache) {
953+
if (update_cache_key(&current_key, cache_fd, &errno_provenance)) {
954+
goto fail;
955+
}
956+
957+
bs_instrumentation(sym_revalidated, path_v);
958+
}
959+
break;
960+
};
847961
}
848962

849963
if (valid_cache) {
@@ -870,6 +984,7 @@ bs_precompile(char * path, VALUE path_v, char * cache_path, VALUE handler)
870984
if (!RB_TYPE_P(storage_data, T_STRING)) goto fail;
871985

872986
/* Write the cache key and storage_data to the cache directory */
987+
bs_cache_key_digest(&current_key, input_data);
873988
res = atomic_write_cache_file(cache_path, &current_key, storage_data, &errno_provenance);
874989
if (res < 0) goto fail;
875990

ext/bootsnap/extconf.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
require "mkmf"
44

55
if %w[ruby truffleruby].include?(RUBY_ENGINE)
6+
have_func "fdatasync", "fcntl.h"
7+
68
unless RUBY_PLATFORM.match?(/mswin|mingw|cygwin/)
79
append_cppflags ["_GNU_SOURCE"] # Needed of O_NOATIME
810
end

test/compile_cache_key_format_test.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ class CompileCacheKeyFormatTest < Minitest::Test
2222

2323
def test_key_version
2424
key = cache_key_for_file(FILE)
25-
exp = [4].pack("L")
25+
exp = [5].pack("L")
2626
assert_equal(exp, key[R[:version]])
2727
end
2828

0 commit comments

Comments
 (0)