Skip to content

Commit

Permalink
fix index w abstract
Browse files Browse the repository at this point in the history
  • Loading branch information
davidheineman committed Aug 26, 2024
1 parent a03a818 commit 01b7c06
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 1 deletion.
3 changes: 2 additions & 1 deletion src/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,8 @@ def main():
# dataset = dataset[:5000] # 5K in 48s/iter on 2 A40s (67K in 2hr)

# Get the abstracts + titles for indexing
collection = [e['title'] + '\n\n' + e['abstract'] for e in dataset]
collection = [e.get('title', '') + '\n\n' + e.get('abstract', '') for e in dataset]
assert all(len(c) > 2 for c in collection)

# Run ColBERT indexer
index_anthology(collection, index_name=INDEX_NAME)
Expand Down
4 changes: 4 additions & 0 deletions src/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,10 @@ def main():
download_openreview(OPENREVIEW_PATH)
dataset += preprocess_openreview(OPENREVIEW_PATH)

for paper in dataset:
if paper['abstract'] == None: paper['abstract'] = ''
if paper['title'] == None: paper['title'] = ''

# Unfortunately, remove papers that have neither an abstract nor a title
dataset = [paper for paper in dataset if (paper['abstract'] != '' or paper['title'] != '')]

Expand Down

0 comments on commit 01b7c06

Please sign in to comment.