Skip to content

Commit

Permalink
update scraping
Browse files Browse the repository at this point in the history
  • Loading branch information
davidheineman committed Aug 25, 2024
1 parent 97af91e commit a336d53
Show file tree
Hide file tree
Showing 8 changed files with 93 additions and 41 deletions.
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,9 @@ To see an example of search, visit:
- Add a dropdown under the "Workshop" box to select specific workshops
- On search quality
- Fix ML entries (only keep accepted papers, figure out which years are included)
- Include the title in the indexing
- Can we return more than 128 documents? Currently, search-then-filter can return no results, just because the top results were in another category.
- On indexing
- Make indexing code better
Expand Down
4 changes: 4 additions & 0 deletions src/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ def parse_results(results):
author = author.replace("{", "").replace("}", "").replace('\\"', "")
abstract = abstract.replace("{", "").replace("}", "").replace("\\", "")

# Parse list of strings
import ast
author = ast.literal_eval(str(author))

parsed_results[int(pid)] = {
'title': title,
'abstract': abstract,
Expand Down
44 changes: 36 additions & 8 deletions src/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,31 +15,59 @@ def preprocess_openreview(openreview_path):
openreview = openreview['conference']

for conf_name, conf_entries in openreview.items():
skipped = 0

year = conf_name.split('/')[1]
for conf_entry in conf_entries:
# raise RuntimeError(conf_entry['content'])
# raise RuntimeError(conf_entry)

try:
bibtex = conf_entry['content']['_bibtex']
bibkey = bibtex.split('{')[1].split(',')[0].replace('\n', '')

venue = conf_entry['content']['venue']
venueid = conf_entry['content']['venueid'].split('.cc')[0]

if 'Submitted' in venue:
venue_type = 'rejected'
elif 'notable top 25%' in venue:
venue_type = 'oral'
elif 'notable top 5%' in venue:
venue_type = 'spotlight'
elif 'Accept' in venue:
venue_type = 'poster'
else:
venue_type = venue.split(' ')[2].lower()

assert venue_type in ['spotlight', 'oral', 'poster', 'invite', 'rejected'], venue_type

formatted_entry = {
'title': conf_entry['content']['title'],
'abstract': conf_entry['content']['abstract'], # some failures
'year': year,
'url': 'https://openreview.net' + conf_entry['content']['pdf'],
'year': int(year),
'url': 'https://openreview.net/forum?id=' + conf_entry['id'], # 'forum', 'original'
'pdf': 'https://openreview.net' + conf_entry['content']['pdf'],
'authors': conf_entry['content']['authors'],
# 'TL;DR': conf_entry['content']['TL;DR'], # some failures
'venue': conf_entry['content']['venue'],
'venueid': conf_entry['content']['venueid'],
'_bibtex': conf_entry['content']['_bibtex'], # some failures
'venue': venue,
'venueid': venueid,
'_bibtex': bibtex, # some failures
'_bibkey': bibkey,

'invitation': conf_entry['invitation'],

'findings': False,
'venue_type': 'main'
'venue_type': venue_type,

'area': 'ml'
}

dataset += [formatted_entry]
except KeyError as e:
print(e)
skipped += 1
# print(e)

print(f'Processed {len(conf_entries)-skipped} / {len(conf_entries)} entries for {conf_name}')

return dataset

Expand Down
14 changes: 11 additions & 3 deletions src/scrape/acl.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from tqdm import tqdm

from constants import ANTHOLOGY_PATH
from anthology import Anthology, Paper
from anthology import Anthology, Paper, PersonName


ANTHOLOGY_RAW_PATH = os.path.join(CURRENT_DIR, 'acl_data')
Expand All @@ -33,15 +33,21 @@ def preprocess_acl(anthology_path):

venue_type, is_findings = get_venue_type(year, url)

authors = [person for person in paper.iter_people()]
authors = [author for author, id, type_ in authors]

for i, author in enumerate(authors):
if not isinstance(author, str):
authors[i] = str(author)

formatted_entry = {
'title': paper.get_title(form='plain'),
'abstract': paper.get_abstract(form='plain'),

'year': year,
'url': url,
'pdf': paper_dict.get('pdf'),
'authors': paper_dict.get('author_string'), # [p for p in paper.iter_people()],
# 'venue': paper_dict['venue'][0], # failures?
'authors': authors,
'venue': paper_dict['booktitle'],
'venueid': paper.get_venue_acronym(),
'_bibtex': paper.as_bibtex(concise=True),
Expand All @@ -52,6 +58,8 @@ def preprocess_acl(anthology_path):
'venue_type': venue_type,
'findings': is_findings,

'area': 'nlp'

# 'TL;DR': None,
}

Expand Down
8 changes: 8 additions & 0 deletions src/scrape/openrev.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,16 +96,24 @@ def download_openreview(openreview_path):
groups = ["conference"]

venues = get_venues(client, conferences, years)

print(venues)

grouped_venues = group_venues(venues, groups)

print(grouped_venues)

papers = get_papers(client, grouped_venues, only_accepted)

for i, t in enumerate(papers):
for j, c in enumerate(papers[t]):
for k, p in enumerate(papers[t][c]):
papers[t][c][k] = p.to_json()

# papers['conference'] = papers.get('conference', []) + papers.get('Conference', [])
# if 'Conference' in papers:
# del papers['Conference']

os.makedirs(os.path.dirname(openreview_path), exist_ok=True)
with open(openreview_path, "w", encoding="utf-8") as json_file:
json.dump(papers, json_file, indent=4)
Expand Down
4 changes: 2 additions & 2 deletions src/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@


NCELLS = 1 # Number of self.centroids to use in PLAID
CENTROID_SCORE_THRESHOLD = 0.5 # How close a document has to be to a centroid to be considered
NDOCS = 512 # Number of closest documents to consider
CENTROID_SCORE_THRESHOLD = 0.8 # 0.5 # How close a document has to be to a centroid to be considered
NDOCS = 8192 # 512 # Number of closest documents to consider


class ColBERT():
Expand Down
4 changes: 2 additions & 2 deletions src/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def query():
is_findings=is_findings
)

K = 20
K = 1000
server_response = server_response[:K]

return server_response
Expand Down Expand Up @@ -108,4 +108,4 @@ def index():
colbert = ColBERT(index_path=INDEX_PATH)
print(colbert.search('text simplificaiton'))
print(api_search_query("text simplification")['topk'][:5])
app.run("0.0.0.0", PORT) # debug=True
app.run("0.0.0.0", PORT, debug=True)
54 changes: 28 additions & 26 deletions src/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ <h1 class="mb-3">papers</h1>
<div class="mb-3">
<div class="form-check">
<input class="form-check-input" type="checkbox" value="nlp" id="nlp" checked>
<label class="form-check-label" for="nlp">ACL</label>
<label class="form-check-label" for="nlp">ACL/EMNLP</label>
</div>
<div class="ml-4">
<div class="form-check">
Expand Down Expand Up @@ -73,12 +73,28 @@ <h1 class="mb-3">papers</h1>
<div class="mb-3">
<div class="form-check">
<input class="form-check-input" type="checkbox" value="ml" id="ml">
<label class="form-check-label" for="ml">ML</label>
<label class="form-check-label" for="ml">NeurIPS/ICLR/ICML</label>
</div>
<div class="ml-4">
<div class="form-check">
<input class="form-check-input ml-checkbox" type="checkbox" value="main" id="ml-main">
<label class="form-check-label" for="ml-main">Conference</label>
<input class="form-check-input ml-checkbox" type="checkbox" value="spotlight" id="ml-spotlight">
<label class="form-check-label" for="ml-spotlight">Spotlight</label>
</div>
<div class="form-check">
<input class="form-check-input ml-checkbox" type="checkbox" value="oral" id="ml-oral">
<label class="form-check-label" for="ml-oral">Oral</label>
</div>
<div class="form-check">
<input class="form-check-input ml-checkbox" type="checkbox" value="poster" id="ml-poster">
<label class="form-check-label" for="ml-poster">Poster</label>
</div>
<div class="form-check">
<input class="form-check-input ml-checkbox" type="checkbox" value="invite" id="ml-invite">
<label class="form-check-label" for="ml-invite">Invite</label>
</div>
<div class="form-check">
<input class="form-check-input ml-checkbox" type="checkbox" value="rejected" id="ml-rejected">
<label class="form-check-label" for="ml-rejected">Rejected</label>
</div>
</div>
</div>
Expand Down Expand Up @@ -154,14 +170,21 @@ <h1 class="mb-3">papers</h1>
});
});


// Parent "ml" checkbox: when it is toggled, copy its checked state onto
// every child checkbox carrying the .ml-checkbox class.
document.getElementById('ml').addEventListener('change', function() {
var isChecked = this.checked;
document.querySelectorAll('.ml-checkbox').forEach(function(checkbox) {
checkbox.checked = isChecked;
});
});

// Child .ml-checkbox boxes: whenever any one of them changes, re-derive the
// parent "ml" checkbox so it is checked while at least one child is checked
// and unchecked only when every child is unchecked.
document.querySelectorAll('.ml-checkbox').forEach(function(checkbox) {
checkbox.addEventListener('change', function() {
const allMlCheckboxes = document.querySelectorAll('.ml-checkbox');
const allUnchecked = Array.from(allMlCheckboxes).every(cb => !cb.checked);
document.getElementById('ml').checked = !allUnchecked;
});
});

function displayResults(data) {
const resultsDiv = document.getElementById('results');
if (data.length === 0) {
Expand All @@ -173,18 +196,6 @@ <h1 class="mb-3">papers</h1>

let html = '';
data.forEach(paper => {
// html += `
// <div class="card">
// <div class="card-body">
// <h5 class="card-title"><a href="${paper.url}" target="_blank">${paper.title}</a></h5>
// <h6 class="card-subtitle mb-1 text-muted">${paper.author}</h6>
// <p class="card-subtitle text-muted paper-metadata">
// <strong>${paper.year} / ${paper.venueid} / ${paper.venue_type} </strong> <br>
// </p>
// </div>
// </div>
// `;

paper.author = Array.isArray(paper.author) ? paper.author : [paper.author];

html += `
Expand Down Expand Up @@ -255,16 +266,7 @@ <h6 class="card-subtitle mb-1 text-light-authors">${paper.author.join(', ')}</h6

const abstractElement = document.querySelector(`#abstract-${paperId}`);
if (abstractElement) {
// Toggle visibility if the abstract element already exists
abstractElement.style.display = abstractElement.style.display === 'none' ? 'block' : 'none';
} else {
// Create and insert the abstract element if it doesn't exist
// const cardBody = document.querySelector(`#paper-${paperId} .card-body`);
// const abstractParagraph = document.createElement('p');
// abstractParagraph.id = `abstract-${paperId}`;
// abstractParagraph.className = 'card-text mt-2';
// abstractParagraph.innerHTML = `<small class="text-muted">${paper.abstract}</small>`;
// cardBody.appendChild(abstractParagraph);
}
}

Expand Down

0 comments on commit a336d53

Please sign in to comment.