
Commit 4e54cd4

Fix stuck progress bar on "Add nodes" of archive import (#7118)
The removal of the filter_size batching in #6998 caused the progress bar to update only once, at the end, instead of incrementally. Here, we re-introduce query batching (50k IDs per batch, for UX) and add QueryBuilder streaming-based progress updates (every ~10k entities) by replacing the blocking list comprehension with an explicit for-loop. Progress now updates during both query batching and result streaming.
1 parent d123ac2 commit 4e54cd4
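
To illustrate the core of the fix in isolation: a list comprehension that materialises every transformed row before a single progress update makes the bar look stuck, whereas consuming the same iterator batch by batch lets the bar advance as the work happens. The sketch below is illustrative only and uses hypothetical stand-ins (fetch_rows, transform, process, and a plain tqdm bar), not the actual aiida objects.

    from tqdm import tqdm

    def transform(row):   # hypothetical per-row conversion
        return row

    def process(batch):   # hypothetical consumer, e.g. a bulk insert
        pass

    def fetch_rows():     # hypothetical source iterator, e.g. a streamed query
        yield from range(1_000_000)

    total = 1_000_000
    batch_size = 10_000

    with tqdm(total=total) as progress:
        # Before: everything is materialised first, so the only update fires at the very end
        # rows = [transform(row) for row in fetch_rows()]
        # process(rows)
        # progress.update(len(rows))

        # After: stream the rows in batches and update the bar as each batch completes
        batch = []
        for row in fetch_rows():
            batch.append(transform(row))
            if len(batch) >= batch_size:
                process(batch)
                progress.update(len(batch))
                batch = []
        if batch:
            process(batch)
            progress.update(len(batch))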

File tree

1 file changed: +15, -11 lines changed

src/aiida/tools/archive/imports.py

Lines changed: 15 additions & 11 deletions
@@ -109,7 +109,7 @@ def import_archive(
     if not (merge_extras[0] in ['k', 'n'] and merge_extras[1] in ['c', 'n'] and merge_extras[2] in ['l', 'u', 'd']):
         raise ValueError('merge_extras contains invalid values')
     if merge_comments not in ('leave', 'newest', 'overwrite'):
-        raise ValueError(f"merge_comments not in {('leave', 'newest', 'overwrite')!r}")
+        raise ValueError(f'merge_comments not in {("leave", "newest", "overwrite")!r}')
     type_check(group, orm.Group, allow_none=True)
     type_check(test_run, bool)
     backend = backend or get_manager().get_profile_storage()
@@ -234,20 +234,24 @@ def _add_new_entities(
         ufields.append(ufield)
 
     with get_progress_reporter()(desc=f'Adding new {etype.value}(s)', total=total) as progress:
-        rows = [
-            transform(row)
-            for row in QueryBuilder(backend=backend_from)
-            .append(
+        # For UX: batch large ID lists so queries start returning results faster
+        # Even though the improved IN clause handles any size, query planning for 500k+ IDs can be slow
+        query_batch_size = 50_000
+
+        # Batch the IDs for querying (UX optimization, not a technical requirement)
+        for _, ufields_batch in batch_iter(ufields, query_batch_size):
+            query = QueryBuilder(backend=backend_from).append(
                 entity_type_to_orm[etype],
-                filters={unique_field: {'in': ufields}},
+                filters={unique_field: {'in': ufields_batch}},
                 project=['**'],
                 tag='entity',
             )
-            .dict(batch_size=batch_size)
-        ]
-        new_ids = backend_to.bulk_insert(etype, rows)
-        backend_unique_id.update({row[unique_field]: pk for pk, row in zip(new_ids, rows)})
-        progress.update(len(rows))
+
+            # Batch the results processing for progress updates and memory efficiency
+            for nrows, rows_batch in batch_iter(query.dict(batch_size=batch_size), batch_size, transform):
+                new_ids = backend_to.bulk_insert(etype, rows_batch)
+                backend_unique_id.update({row[unique_field]: pk for pk, row in zip(new_ids, rows_batch)})
+                progress.update(nrows)
 
 
 def _import_users(
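
The new code relies on a batch_iter helper that takes an iterable, a batch size, and an optional transform, and yields (count, batch) pairs; that is how the inner loop obtains nrows for each progress update. A minimal sketch consistent with that usage follows; the helper actually shipped in aiida-core may differ in detail.

    from itertools import islice
    from typing import Any, Callable, Iterable, Iterator, Optional

    def batch_iter(
        iterable: Iterable[Any],
        size: int,
        transform: Optional[Callable[[Any], Any]] = None,
    ) -> Iterator[tuple[int, list]]:
        """Yield (number_of_items, batch) tuples of at most `size` items, optionally transformed."""
        transform = transform or (lambda item: item)
        iterator = iter(iterable)
        while True:
            batch = [transform(item) for item in islice(iterator, size)]
            if not batch:
                return
            yield len(batch), batch

With a helper like this, the outer loop over ufields keeps each IN clause at 50k values so query planning stays fast, while the inner loop over query.dict(batch_size=batch_size) turns the previously blocking comprehension into a stream that calls progress.update(nrows) after every processed batch.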
