update scraping

davidheineman · Aug 25, 2024 · a336d53 · a336d53
1 parent 97af91e
commit a336d53
Show file tree

Hide file tree

Showing 8 changed files with 93 additions and 41 deletions.
diff --git a/README.md b/README.md
@@ -110,7 +110,9 @@ To see an example of search, visit:
         - Add a dropdown under the "Workshop" box to select specific workshops
 
     - On search quality
+        - Fix ML entries (only keep accepted papers, figure out which years are included)
         - Include the title in the indexing
+        - Can we return more than 128 documents? Currently, search-then-filter can return no results, just because the top results were in another category.
 
     - On indexing
         - Make indexing code better 

diff --git a/src/db.py b/src/db.py
@@ -90,6 +90,10 @@ def parse_results(results):
         author  = author.replace("{", "").replace("}", "").replace('\\"', "")
         abstract = abstract.replace("{", "").replace("}", "").replace("\\", "")
 
+        # Parse list of strings
+        import ast
+        author = ast.literal_eval(str(author))
+
         parsed_results[int(pid)] = {
             'title': title, 
             'abstract': abstract, 

diff --git a/src/parse.py b/src/parse.py
@@ -15,31 +15,59 @@ def preprocess_openreview(openreview_path):
     openreview = openreview['conference']
 
     for conf_name, conf_entries in openreview.items():
+        skipped = 0
+
         year = conf_name.split('/')[1]
         for conf_entry in conf_entries:
-            # raise RuntimeError(conf_entry['content'])
+            # raise RuntimeError(conf_entry)
+
             try:
+                bibtex = conf_entry['content']['_bibtex']
+                bibkey = bibtex.split('{')[1].split(',')[0].replace('\n', '')
+
+                venue = conf_entry['content']['venue']
+                venueid = conf_entry['content']['venueid'].split('.cc')[0]
+
+                if 'Submitted' in venue:
+                    venue_type = 'rejected'
+                elif 'notable top 25%' in venue:
+                    venue_type = 'oral'
+                elif 'notable top 5%' in venue:
+                    venue_type = 'spotlight'
+                elif 'Accept' in venue:
+                    venue_type = 'poster'
+                else:
+                    venue_type = venue.split(' ')[2].lower()
+
+                assert venue_type in ['spotlight', 'oral', 'poster', 'invite', 'rejected'], venue_type
+
                 formatted_entry = {
                     'title':    conf_entry['content']['title'],
                     'abstract': conf_entry['content']['abstract'], # some failures
-                    'year':     year,
-                    'url':      'https://openreview.net' + conf_entry['content']['pdf'],
+                    'year':     int(year),
+                    'url':      'https://openreview.net/forum?id=' + conf_entry['id'], # 'forum', 'original'
                     'pdf':      'https://openreview.net' + conf_entry['content']['pdf'],
                     'authors':  conf_entry['content']['authors'],
                     # 'TL;DR':    conf_entry['content']['TL;DR'], # some failures
-                    'venue':    conf_entry['content']['venue'],
-                    'venueid':  conf_entry['content']['venueid'],
-                    '_bibtex':  conf_entry['content']['_bibtex'], # some failures
+                    'venue':    venue,
+                    'venueid':  venueid,
+                    '_bibtex':  bibtex, # some failures
+                    '_bibkey':  bibkey,
 
                     'invitation': conf_entry['invitation'],
 
                     'findings': False,
-                    'venue_type': 'main'
+                    'venue_type': venue_type,
+
+                    'area': 'ml'
                 }
 
                 dataset += [formatted_entry]
             except KeyError as e:
-                print(e)
+                skipped += 1
+                # print(e)
+
+        print(f'Processed {len(conf_entries)-skipped} / {len(conf_entries)} entries for {conf_name}')
 
     return dataset
 

diff --git a/src/scrape/acl.py b/src/scrape/acl.py
@@ -13,7 +13,7 @@
 from tqdm import tqdm
 
 from constants import ANTHOLOGY_PATH
-from anthology import Anthology, Paper
+from anthology import Anthology, Paper, PersonName
 
 
 ANTHOLOGY_RAW_PATH = os.path.join(CURRENT_DIR, 'acl_data')
@@ -33,15 +33,21 @@ def preprocess_acl(anthology_path):
 
         venue_type, is_findings = get_venue_type(year, url)
 
+        authors = [person for person in paper.iter_people()]
+        authors = [author for author, id, type_ in authors]
+
+        for i, author in enumerate(authors):
+            if not isinstance(author, str):
+                authors[i] = str(author)
+
         formatted_entry = {
             'title':    paper.get_title(form='plain'),
             'abstract': paper.get_abstract(form='plain'),
 
             'year':     year,
             'url':      url,
             'pdf':      paper_dict.get('pdf'),
-            'authors':  paper_dict.get('author_string'), # [p for p in paper.iter_people()],
-            # 'venue':    paper_dict['venue'][0], # failures?
+            'authors':  authors,
             'venue':    paper_dict['booktitle'],
             'venueid':  paper.get_venue_acronym(),
             '_bibtex':  paper.as_bibtex(concise=True),
@@ -52,6 +58,8 @@ def preprocess_acl(anthology_path):
             'venue_type': venue_type,
             'findings': is_findings,
 
+            'area': 'nlp'
+
             # 'TL;DR':    None,
         }
 

diff --git a/src/scrape/openrev.py b/src/scrape/openrev.py
@@ -96,16 +96,24 @@ def download_openreview(openreview_path):
     groups = ["conference"]
 
     venues = get_venues(client, conferences, years)
+
     print(venues)
+
     grouped_venues = group_venues(venues, groups)
+
     print(grouped_venues)
+
     papers = get_papers(client, grouped_venues, only_accepted)
 
     for i, t in enumerate(papers):
         for j, c in enumerate(papers[t]):
             for k, p in enumerate(papers[t][c]):
                 papers[t][c][k] = p.to_json()
 
+    # papers['conference'] = papers.get('conference', []) + papers.get('Conference', [])
+    # if 'Conference' in papers:
+    #     del papers['Conference']
+
     os.makedirs(os.path.dirname(openreview_path), exist_ok=True)
     with open(openreview_path, "w", encoding="utf-8") as json_file:
         json.dump(papers, json_file, indent=4)

diff --git a/src/search.py b/src/search.py
@@ -17,8 +17,8 @@
 
 
 NCELLS = 1  # Number of self.centroids to use in PLAID
-CENTROID_SCORE_THRESHOLD = 0.5 # How close a document has to be to a centroid to be considered
-NDOCS = 512  # Number of closest documents to consider
+CENTROID_SCORE_THRESHOLD = 0.8 # 0.5  # How close a document has to be to a centroid to be considered
+NDOCS = 8192 # 512  # Number of closest documents to consider
 
 
 class ColBERT():

diff --git a/src/server.py b/src/server.py
@@ -79,7 +79,7 @@ def query():
         is_findings=is_findings
     )
 
-    K = 20
+    K = 1000
     server_response = server_response[:K]
 
     return server_response
@@ -108,4 +108,4 @@ def index():
     colbert = ColBERT(index_path=INDEX_PATH)
     print(colbert.search('text simplificaiton'))
     print(api_search_query("text simplification")['topk'][:5])
-    app.run("0.0.0.0", PORT) # debug=True
+    app.run("0.0.0.0", PORT, debug=True)
diff --git a/src/templates/index.html b/src/templates/index.html
@@ -33,7 +33,7 @@ <h1 class="mb-3">papers</h1>
                     <div class="mb-3">
                         <div class="form-check">
                             <input class="form-check-input" type="checkbox" value="nlp" id="nlp" checked>
-                            <label class="form-check-label" for="nlp">ACL</label>
+                            <label class="form-check-label" for="nlp">ACL/EMNLP</label>
                         </div>
                         <div class="ml-4">
                             <div class="form-check">
@@ -73,12 +73,28 @@ <h1 class="mb-3">papers</h1>
                     <div class="mb-3">
                         <div class="form-check">
                             <input class="form-check-input" type="checkbox" value="ml" id="ml">
-                            <label class="form-check-label" for="ml">ML</label>
+                            <label class="form-check-label" for="ml">NeurIPS/ICLR/ICML</label>
                         </div>
                         <div class="ml-4">
                             <div class="form-check">
-                                <input class="form-check-input ml-checkbox" type="checkbox" value="main" id="ml-main">
-                                <label class="form-check-label" for="ml-main">Conference</label>
+                                <input class="form-check-input ml-checkbox" type="checkbox" value="spotlight" id="ml-spotlight">
+                                <label class="form-check-label" for="ml-spotlight">Spotlight</label>
+                            </div>
+                            <div class="form-check">
+                                <input class="form-check-input ml-checkbox" type="checkbox" value="oral" id="ml-oral">
+                                <label class="form-check-label" for="ml-oral">Oral</label>
+                            </div>
+                            <div class="form-check">
+                                <input class="form-check-input ml-checkbox" type="checkbox" value="poster" id="ml-poster">
+                                <label class="form-check-label" for="ml-poster">Poster</label>
+                            </div>
+                            <div class="form-check">
+                                <input class="form-check-input ml-checkbox" type="checkbox" value="invite" id="ml-invite">
+                                <label class="form-check-label" for="ml-invite">Invite</label>
+                            </div>
+                            <div class="form-check">
+                                <input class="form-check-input ml-checkbox" type="checkbox" value="rejected" id="ml-rejected">
+                                <label class="form-check-label" for="ml-rejected">Rejected</label>
                             </div>
                         </div>
                     </div>
@@ -154,14 +170,21 @@ <h1 class="mb-3">papers</h1>
             });
         });
 
-
         document.getElementById('ml').addEventListener('change', function() {
             var isChecked = this.checked;
             document.querySelectorAll('.ml-checkbox').forEach(function(checkbox) {
                 checkbox.checked = isChecked;
             });
         });
 
+        document.querySelectorAll('.ml-checkbox').forEach(function(checkbox) {
+            checkbox.addEventListener('change', function() {
+                const allMlCheckboxes = document.querySelectorAll('.ml-checkbox');
+                const allUnchecked = Array.from(allMlCheckboxes).every(cb => !cb.checked);
+                document.getElementById('ml').checked = !allUnchecked;
+            });
+        });
+
         function displayResults(data) {
             const resultsDiv = document.getElementById('results');
             if (data.length === 0) {
@@ -173,18 +196,6 @@ <h1 class="mb-3">papers</h1>
 
             let html = '';
             data.forEach(paper => {
-                // html += `
-                //     <div class="card">
-                //         <div class="card-body">
-                //             <h5 class="card-title"><a href="${paper.url}" target="_blank">${paper.title}</a></h5>
-                //             <h6 class="card-subtitle mb-1 text-muted">${paper.author}</h6>
-                //             <p class="card-subtitle text-muted paper-metadata">
-                //                 <strong>${paper.year} / ${paper.venueid} / ${paper.venue_type} </strong> <br>
-                //             </p>
-                //         </div>
-                //     </div>
-                // `;
-
                 paper.author = Array.isArray(paper.author) ? paper.author : [paper.author];
 
                 html += `
@@ -255,16 +266,7 @@ <h6 class="card-subtitle mb-1 text-light-authors">${paper.author.join(', ')}</h6
 
             const abstractElement = document.querySelector(`#abstract-${paperId}`);
             if (abstractElement) {
-                // Toggle visibility if the abstract element already exists
                 abstractElement.style.display = abstractElement.style.display === 'none' ? 'block' : 'none';
-            } else {
-                // Create and insert the abstract element if it doesn't exist
-                // const cardBody = document.querySelector(`#paper-${paperId} .card-body`);
-                // const abstractParagraph = document.createElement('p');
-                // abstractParagraph.id = `abstract-${paperId}`;
-                // abstractParagraph.className = 'card-text mt-2';
-                // abstractParagraph.innerHTML = `<small class="text-muted">${paper.abstract}</small>`;
-                // cardBody.appendChild(abstractParagraph);
             }
         }