Skip to content

Commit b6d4561

Browse files
committed
test-tool: offer to generate insanely-large synthetic pack files
To study Git at scale, and to keep it working even for the largest hosted repositories, we need a way to generate pack files containing _many_ objects. This new helper generates such synthetic pack files, fast. To do so, it side-steps most of Git's regular machinery and even avoids the time tax of deflating (and later on inflating) the objects. Instead, it hard-codes objects using the uncompressed format so that objects can be generated _really_ fast. On my laptop this tool can generate a pack with 215 million objects in slightly over 40 seconds. Signed-off-by: Johannes Schindelin <johannes.schindelin@gmx.de>
1 parent 05d862f commit b6d4561

File tree

1 file changed

+78
-0
lines changed

1 file changed

+78
-0
lines changed

t/helper/test-synthesize.c

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "strbuf.h"
1313
#include "parse-options.h"
1414
#include "strmap.h"
15+
#include "hash.h"
1516

1617
/*
1718
* This helper generates artificial repositories. To do so, it uses a
@@ -477,16 +478,93 @@ static int cmd__synthesize__commits(int argc, const char **argv, const char *pre
477478
return 0;
478479
}
479480

481+
static int generate_pack(const char *path, size_t object_count, const struct git_hash_algo *algo)
482+
{
483+
FILE *f = fopen_for_writing(path);
484+
git_hash_ctx ctx;
485+
size_t i, bytes_needed = 0;
486+
unsigned char *counter = (void *)&i;
487+
char buf[1024];
488+
489+
for (i = object_count; i; i >>= 8)
490+
bytes_needed++;
491+
492+
/* Let `counter` point at the relevant bytes of the variable `i` */
493+
i = 1;
494+
if (!*counter)
495+
counter += sizeof(i) - bytes_needed;
496+
497+
algo->init_fn(&ctx);
498+
499+
memcpy(buf, "PACK", 4);
500+
put_be32(buf + 4, 2);
501+
put_be32(buf + 8, object_count);
502+
fwrite(buf, 1, 12, f);
503+
algo->update_fn(&ctx, buf, 12);
504+
505+
buf[0] = 0x30 + bytes_needed; /* always a blob */
506+
/*
507+
* Uncompressed zlib always starts with 0x78 0x01 0x01, followed by two
508+
* bytes encoding the size, little endian, then two bytes with the
509+
* bitwise-complement of that size, then the payload, and then the
510+
* Adler32 checksum.
511+
*/
512+
buf[1] = 0x78;
513+
buf[2] = 0x01;
514+
buf[3] = 0x01;
515+
516+
buf[4] = bytes_needed & 0xff;
517+
buf[5] = (bytes_needed >> 8) & 0xff;
518+
buf[6] = buf[4] ^ 0xff;
519+
buf[7] = buf[5] ^ 0xff;
520+
521+
for (i = 0; i < object_count; i++) {
522+
/* write a non-compressed entry */
523+
memcpy(buf + 8, counter, bytes_needed);
524+
put_be32(buf + 8 + bytes_needed, adler32(1l, counter, bytes_needed));
525+
526+
fwrite(buf, 1, 12 + bytes_needed, f);
527+
algo->update_fn(&ctx, buf, 12 + bytes_needed);
528+
}
529+
530+
algo->final_fn((unsigned char *)buf, &ctx);
531+
fwrite(buf, 1, algo->rawsz, f);
532+
533+
fclose(f);
534+
535+
return 0;
536+
}
537+
538+
/*
 * Entry point for `test-tool synthesize pack <object-count> <filename>`.
 *
 * Parses the decimal object count from argv[1] and passes it, together with
 * the output path in argv[2] and the SHA-1 hash algorithm, to
 * generate_pack(). Dies on a wrong argument count or an object count that is
 * not a valid non-negative number fitting into `size_t`.
 *
 * Returns 0 if generate_pack() returned 0, and 1 otherwise.
 */
static int cmd__synthesize__pack(int argc, const char **argv, const char *prefix UNUSED)
{
	const struct git_hash_algo *algo = hash_algos + GIT_HASH_SHA1;
	uintmax_t parsed;
	char *end;
	const char *path;

	if (argc != 3)
		die("usage: test-tool synthesize pack <object-count> <filename>");

	/*
	 * strtoumax() reports no error for trailing garbage and happily
	 * negates values with a leading minus sign, so validate explicitly:
	 * the whole argument must be consumed, must not contain a '-', and
	 * must fit into size_t.
	 */
	errno = 0;
	parsed = strtoumax(argv[1], &end, 10);
	if (end == argv[1] || *end || errno == ERANGE ||
	    strchr(argv[1], '-') || parsed > SIZE_MAX)
		die("invalid object count: '%s'", argv[1]);

	path = argv[2];

	return !!generate_pack(path, (size_t)parsed, algo);
}
555+
480556
int cmd__synthesize(int argc, const char **argv)
481557
{
482558
const char *prefix = NULL;
483559
char const * const synthesize_usage[] = {
484560
"test-tool synthesize commits <options>",
561+
"test-tool synthesize pack <options>",
485562
NULL,
486563
};
487564
parse_opt_subcommand_fn *fn = NULL;
488565
struct option options[] = {
489566
OPT_SUBCOMMAND("commits", &fn, cmd__synthesize__commits),
567+
OPT_SUBCOMMAND("pack", &fn, cmd__synthesize__pack),
490568
OPT_END()
491569
};
492570
argc = parse_options(argc, argv, prefix, options, synthesize_usage, 0);

0 commit comments

Comments
 (0)