Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
140d075
[file_packager] split data files when files exceeds configured limit
arsnyder16 Jul 29, 2025
505d26a
comment
arsnyder16 Jul 29, 2025
11ec397
whitespace
arsnyder16 Jul 29, 2025
d36ab45
static limit
arsnyder16 Jul 31, 2025
c2d0ba7
Merge remote-tracking branch 'origin/main' into asnyder/split_large_p…
arsnyder16 Jul 31, 2025
a7ac251
only if exclusively preload
arsnyder16 Jul 31, 2025
dfe6838
cleanup
arsnyder16 Jul 31, 2025
f73f613
ruff
arsnyder16 Jul 31, 2025
5ad1003
Merge branch 'emscripten-core:main' into asnyder/split_large_packages
arsnyder16 Aug 4, 2025
15d50ee
add tests
arsnyder16 Aug 10, 2025
30421d5
Merge branch 'main' into asnyder/split_large_packages
arsnyder16 Aug 10, 2025
95b5f33
rev
arsnyder16 Aug 10, 2025
f495602
ruff
arsnyder16 Aug 10, 2025
1daf7d4
no needed
arsnyder16 Aug 10, 2025
bffa305
remove debugging
arsnyder16 Aug 10, 2025
e6217c0
Merge branch 'main' into asnyder/split_large_packages
arsnyder16 Aug 18, 2025
c5d44a2
test perf
arsnyder16 Aug 18, 2025
6d4fe7c
ruff
arsnyder16 Aug 18, 2025
b04d3ed
2046 limit
arsnyder16 Aug 19, 2025
9ccb2ed
Merge branch 'main' into asnyder/split_large_packages
arsnyder16 Aug 19, 2025
f1a88f1
Merge branch 'main' into asnyder/split_large_packages
arsnyder16 Aug 19, 2025
e7be18a
Merge remote-tracking branch 'origin/main' into asnyder/split_large_p…
arsnyder16 Aug 22, 2025
9411cb6
Merge branch 'asnyder/split_large_packages' of https://github.com/ars…
arsnyder16 Aug 22, 2025
4edef4d
remove
arsnyder16 Aug 22, 2025
0e645d5
Merge remote-tracking branch 'origin/main' into asnyder/split_large_p…
arsnyder16 Sep 17, 2025
04f04ad
fix
arsnyder16 Sep 17, 2025
7737645
try to reduce memory usage
arsnyder16 Sep 17, 2025
759062c
PR feedback
arsnyder16 Sep 17, 2025
0ad75b8
ruff
arsnyder16 Sep 17, 2025
36c2a2b
test loading
arsnyder16 Sep 18, 2025
f268944
ruff
arsnyder16 Sep 18, 2025
2a72cd0
Merge remote-tracking branch 'origin/main' into asnyder/split_large_p…
arsnyder16 Feb 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
158 changes: 157 additions & 1 deletion test/test_other.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,12 @@ def run_on_pty(self, cmd):
os.close(master)
os.close(slave)

def create_huge_file(self, name, length):
  """Create a file of `length` bytes without holding the data in memory.

  Seeking just past the end and writing a single NUL byte produces a sparse
  file on filesystems that support it, so multi-GB test fixtures are cheap
  to create. A zero-length request creates an empty file.
  """
  with open(name, 'wb') as f:
    if length:
      # Position at the last byte and write it; everything before is a hole.
      f.seek(length - 1)
      f.write(b'\0')

# Test that running `emcc -v` always works even in the presence of `EMCC_CFLAGS`.
# This needs to work because many tools run `emcc -v` internally and it should
# always work even if the user has `EMCC_CFLAGS` set.
Expand Down Expand Up @@ -6381,14 +6387,164 @@ def test_underscore_exit(self):

def test_file_packager_huge(self):
  # The "very large bundle" warning should appear only once the package
  # crosses the size threshold, not for small packages.
  warning = ('warning: file packager is creating an asset bundle of 257 MB. '
             'this is very large, and browsers might have trouble loading it')
  self.create_huge_file('huge.dat', 1024 * 1024 * 257)
  create_file('tiny.dat', 'a')

  def package(dat_file):
    # Run the packager on a single input and capture its stderr.
    return self.run_process([FILE_PACKAGER, 'test.data', '--preload', dat_file], stdout=PIPE, stderr=PIPE).stderr

  self.assertNotContained(warning, package('tiny.dat'))
  self.assertContained(warning, package('huge.dat'))
  self.clear()

def test_file_packager_huge_no_split(self):
  # Verify that when packaging up to 2046MB of data (the browser-safe
  # limit), file packager does not split up the generated data file.
  chunk_size = 1024 * 1024 * 256
  last_size = 1024 * 1024 * 254  # 7 * 256MB + 254MB == 2046MB exactly
  for i in range(7):
    self.create_huge_file(f'huge{i}.dat', chunk_size)
  self.create_huge_file('huge7.dat', last_size)
  result = self.run_process([FILE_PACKAGER, 'test.data', '--preload', 'huge7.dat'] + [f'huge{i}.dat' for i in range(7)], stdout=PIPE, stderr=PIPE)
  self.assertContained('warning: file packager is creating an asset bundle of 2046 MB. this is very large, and browsers might have trouble loading it', result.stderr)
  self.assertExists('test.data')
  self.assertNotExists('test_1.data')
  self.assertEqual(os.path.getsize('test.data'), 2046 * 1024 * 1024)
  create_file('load.js', result.stdout)
  # Generate one stat/size check per packaged file rather than hand-writing
  # eight near-identical C blocks.
  sizes = [chunk_size] * 7 + [last_size]
  checks = '\n'.join('  assert(stat("huge%d.dat", &buf) == 0);\n  assert(buf.st_size == %d);' % (i, size)
                     for i, size in enumerate(sizes))
  create_file('src.c', r'''
#include <assert.h>
#include <sys/stat.h>
#include <stdio.h>

int main() {
  struct stat buf;
%s
  printf("done\n");
  return 0;
}
''' % checks)
  self.do_runf('src.c', cflags=['--pre-js=load.js', '-sFORCE_FILESYSTEM'])
  self.clear()

def test_file_packager_huge_split(self):
  # Verify that when size exceeds 2046MB (by a single byte here), file
  # packager splits the generated data file into two: the first seven files
  # fit in the first chunk, and the overflowing file goes into the second.
  chunk_size = 1024 * 1024 * 256
  last_size = (1024 * 1024 * 254) + 1  # pushes the total 1 byte over 2046MB
  for i in range(7):
    self.create_huge_file(f'huge{i}.dat', chunk_size)
  self.create_huge_file('huge7.dat', last_size)
  result = self.run_process([FILE_PACKAGER, 'test.data', '--preload', 'huge7.dat'] + [f'huge{i}.dat' for i in range(7)], stdout=PIPE, stderr=PIPE)
  self.assertContained('warning: file packager is creating an asset bundle of 1792 MB. this is very large, and browsers might have trouble loading it', result.stderr)
  self.assertContained('warning: file packager is splitting bundle into 2 chunks', result.stderr)
  self.assertExists('test.data')
  self.assertExists('test_1.data')
  self.assertEqual(os.path.getsize('test.data'), chunk_size * 7)
  self.assertEqual(os.path.getsize('test_1.data'), last_size)
  create_file('load.js', result.stdout)
  # Generate one stat/size check per packaged file rather than hand-writing
  # eight near-identical C blocks.
  sizes = [chunk_size] * 7 + [last_size]
  checks = '\n'.join('  assert(stat("huge%d.dat", &buf) == 0);\n  assert(buf.st_size == %d);' % (i, size)
                     for i, size in enumerate(sizes))
  create_file('src.c', r'''
#include <assert.h>
#include <sys/stat.h>
#include <stdio.h>

int main() {
  struct stat buf;
%s
  printf("done\n");
  return 0;
}
''' % checks)
  self.do_runf('src.c', cflags=['--pre-js=load.js', '-sFORCE_FILESYSTEM'])
  self.clear()

def test_file_packager_huge_split_metadata(self):
  # Verify that when size exceeds 2046MB and --separate-metadata is used,
  # file packager splits both the generated data files and the metadata
  # files, and that each chunk's metadata describes only its own files.
  large_size = 1024 * 1024 * 256
  final_size = (1024 * 1024 * 254) + 1  # pushes the total 1 byte over 2046MB
  for i in range(7):
    self.create_huge_file(f'huge{i}.dat', large_size)
  self.create_huge_file('huge7.dat', final_size)
  err = self.run_process([FILE_PACKAGER, 'test.data', '--separate-metadata', '--js-output=immutable.js', '--preload', 'huge7.dat'] + [f'huge{i}.dat' for i in range(7)], stdout=PIPE, stderr=PIPE).stderr
  self.assertContained('warning: file packager is creating an asset bundle of 1792 MB. this is very large, and browsers might have trouble loading it', err)
  self.assertContained('warning: file packager is splitting bundle into 2 chunks', err)
  for output in ('test.data', 'test_1.data', 'immutable.js', 'immutable_1.js'):
    self.assertExists(output)
  self.assertEqual(os.path.getsize('test.data'), large_size * 7)
  self.assertEqual(os.path.getsize('test_1.data'), final_size)

  # First chunk's metadata: seven contiguous 256MB entries.
  self.assertExists('immutable.js.metadata')
  metadata = json.loads(read_file('immutable.js.metadata'))
  self.assertEqual(len(metadata['files']), 7)
  for i, entry in enumerate(metadata['files']):
    self.assertEqual(entry['start'], i * large_size)
    self.assertEqual(entry['end'], (i + 1) * large_size)
    self.assertEqual(entry['filename'], f'/huge{i}.dat')
  self.assertEqual(metadata['remote_package_size'], 7 * large_size)

  # Second chunk's metadata: offsets restart at zero for the single file.
  self.assertExists('immutable_1.js.metadata')
  metadata = json.loads(read_file('immutable_1.js.metadata'))
  self.assertEqual(len(metadata['files']), 1)
  self.assertEqual(metadata['files'][0]['start'], 0)
  self.assertEqual(metadata['files'][0]['end'], final_size)
  self.assertEqual(metadata['files'][0]['filename'], '/huge7.dat')
  self.assertEqual(metadata['remote_package_size'], final_size)
  # Generate one stat/size check per packaged file rather than hand-writing
  # eight near-identical C blocks.
  sizes = [large_size] * 7 + [final_size]
  checks = '\n'.join('  assert(stat("huge%d.dat", &buf) == 0);\n  assert(buf.st_size == %d);' % (i, size)
                     for i, size in enumerate(sizes))
  create_file('src.c', r'''
#include <assert.h>
#include <sys/stat.h>
#include <stdio.h>

int main() {
  struct stat buf;
%s
  printf("done\n");
  return 0;
}
''' % checks)
  self.do_runf('src.c', cflags=['--pre-js=immutable.js', '--pre-js=immutable_1.js', '-sFORCE_FILESYSTEM'])
  self.clear()

def test_file_packager_huge_split_too_large(self):
  # A single file bigger than the per-chunk limit cannot be split, so the
  # packager must reject it with a non-zero exit code and a clear error.
  over_limit = (2046 * 1024 * 1024) + 1
  self.create_huge_file('huge.dat', over_limit)
  proc = self.run_process([FILE_PACKAGER, 'test.data', '--preload', 'huge.dat'],
                          check=False, stdout=PIPE, stderr=PIPE)
  self.assertEqual(proc.returncode, 1)
  self.assertContained('error: cannot package file huge.dat, which is larger than maximum individual file size limit 2046 MB', proc.stderr)
  self.clear()
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks like the above tests verify that file packager does split up files, though there is no functional test to verify that e.g. the bytes were split appropriately, and that the final end result loads up properly? Would that be important to test?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure how critical it is, looks like all the file_packager logic is part of these unit tests, seems like out of scope for this PR to add tests that verify the loading of the file_packager results

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are many tests that verify load of file packager results. Basically any test that uses emcc with --preload-file is verifying this.

There are also tests that call FILE_PACKGER directly and then execute the result. See test_file_packager_separate_metadata for example.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sbc100 Thanks for pointing out these docs; test_file_packager_separate_metadata was a good example to follow to test this logic

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sbc100 @juj I updated to verify the package(s) load but on CircleCI they seem to be crashing immediately. I suspect OOMKilled. They run fine for me locally

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@juj @sbc100 Any guidance here?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@juj @sbc100 Could use some guidance here to get this across the finish line. Now that I addressed the feedback to actually load the results in these tests, they cause the CircleCI processes to run out of memory.


@parameterized({
'': (True,),
'wasm2js': (False,),
Expand Down
84 changes: 51 additions & 33 deletions tools/file_packager.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,9 @@

DEBUG = os.environ.get('EMCC_DEBUG')

# chrome limit is 2MB under 2Gi
PRELOAD_DATA_FILE_LIMIT = 2046 * 1024 * 1024

excluded_patterns: list[str] = []
new_data_files = []
walked = []
Expand Down Expand Up @@ -537,13 +540,53 @@ def was_seen(name):
data_files = sorted(data_files, key=lambda file_: file_.dstpath)
data_files = [file_ for file_ in data_files if not was_seen(file_.dstpath)]

targets = []
if options.obj_output:
if not options.has_embedded:
diagnostics.error('--obj-output is only applicable when embedding files')
targets.append(options.obj_output)
generate_object_file(data_files)
else:
file_chunks = [data_files]
file_chunks = [[]]
current_size = 0
for file_ in data_files:
fsize = os.path.getsize(file_.srcpath)
if current_size + fsize <= PRELOAD_DATA_FILE_LIMIT:
file_chunks[-1].append(file_)
current_size += fsize
elif fsize > PRELOAD_DATA_FILE_LIMIT:
diagnostics.error('cannot package file %s, which is larger than maximum individual file size limit %d MB.' % (file_.srcpath, (PRELOAD_DATA_FILE_LIMIT / (1024 * 1024))))
return 1
else:
current_size = fsize
file_chunks.append([file_])

if len(file_chunks) > 1:
diagnostics.warn('warning: file packager is splitting bundle into %d chunks' % len(file_chunks))

for counter, data_files in enumerate(file_chunks):
metadata = {'files': []}

def construct_data_file_name(base,ext):
return f"{base}{f'_{counter}' if counter else ''}.{ext}"
data_file = construct_data_file_name(*data_target.rsplit('.', 1))
js_file = None if options.jsoutput is None else construct_data_file_name(*options.jsoutput.rsplit('.', 1))
targets.append(data_file)
ret = generate_preload_js(data_file, data_files, metadata, js_file)
if options.force or len(data_files):
if options.jsoutput is None:
print(ret)
else:
# Overwrite the old jsoutput file (if exists) only when its content
# differs from the current generated one, otherwise leave the file
# untouched preserving its old timestamp
targets.append(js_file)
if ret != (utils.read_file(js_file) if os.path.isfile(js_file) else ''):
utils.write_file(js_file, ret)
if options.separate_metadata:
utils.write_file(js_file + '.metadata', json.dumps(metadata, separators=(',', ':')))
if options.depfile:
targets = []
if options.obj_output:
targets.append(options.obj_output)
if options.jsoutput:
targets.append(data_target)
targets.append(options.jsoutput)
with open(options.depfile, 'w') as f:
for target in targets:
if target:
Expand All @@ -554,31 +597,6 @@ def was_seen(name):
f.write(escape_for_makefile(dependency))
f.write(' \\\n')

if options.obj_output:
if not options.has_embedded:
diagnostics.error('--obj-output is only applicable when embedding files')
generate_object_file(data_files)
else:
metadata = {'files': []}

ret = generate_preload_js(data_target, data_files, metadata)

if options.force or data_files:
if options.jsoutput is None:
print(ret)
else:
# Overwrite the old jsoutput file (if exists) only when its content
# differs from the current generated one, otherwise leave the file
# untouched preserving its old timestamp
if os.path.isfile(options.jsoutput):
old = utils.read_file(options.jsoutput)
if old != ret:
utils.write_file(options.jsoutput, ret)
else:
utils.write_file(options.jsoutput, ret)
if options.separate_metadata:
utils.write_file(options.jsoutput + '.metadata', json.dumps(metadata, separators=(',', ':')))

return 0


Expand All @@ -590,7 +608,7 @@ def escape_for_makefile(fpath):
return fpath.replace('$', '$$').replace('#', '\\#').replace(' ', '\\ ')


def generate_preload_js(data_target, data_files, metadata):
def generate_preload_js(data_target, data_files, metadata, js_file):
# emcc will add this to the output itself, so it is only needed for
# standalone calls
if options.from_emcc:
Expand Down Expand Up @@ -1077,7 +1095,7 @@ def generate_preload_js(data_target, data_files, metadata):
} else {
if (!Module['preRun']) Module['preRun'] = [];
Module['preRun'].push(runMetaWithFS);
}\n''' % {'node_support_code': node_support_code, 'metadata_file': os.path.basename(options.jsoutput + '.metadata')}
}\n''' % {'node_support_code': node_support_code, 'metadata_file': os.path.basename(js_file + '.metadata')}
else:
ret += '''
}
Expand Down