
Commit d115b09

Merge pull request #114 from trigaten/topic-model
Add topic modeling code
2 parents: 5059651 + 113e919

16 files changed: +2776, -3 lines

.gitignore

Lines changed: 2 additions & 0 deletions
@@ -14,5 +14,7 @@ scripts/arxiv_papers_with_ai_labels.csv
 papers_output/*
 data/arxiv_papers_for_human_review.csv
 papers
+scripts/master_papers.csv
+scripts/t.py
 /RP_eval_results_*.json
 scripts/master_papers.csv

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -15,5 +15,6 @@ tika
 tqdm
 openai
 load_dotenv
+tomotopy
 wordcloud
 -e .

scripts/download_and_process_pdfs.py

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ def filter_and_save_pdfs(folder_path, csv_path, output_csv_path):
         papList.append(
             Paper(
                 row["title"],
-                row["firstAuthor"],
+                row["authors"],
                 row["url"],
                 row["dateSubmitted"],
                 row["keywords"],

scripts/download_data_pipeline.ipynb

Lines changed: 2 additions & 2 deletions
@@ -106,7 +106,7 @@
     "source": [
      "blacklist = pd.read_csv(\"../data/blacklist.csv\")\n",
      "blacklist[\"title\"] = blacklist[\"title\"].apply(lambda x: process_paper_title(x))\n",
-     "blacklist"
+     "len(blacklist)"
     ]
    },
    {
@@ -391,7 +391,7 @@
     "\n",
     "df_combined.to_csv(\"master_papers.csv\")\n",
     "\n",
-    "auto_pipeline(\"master_papers.csv\", \"papers\")"
+    "auto_pipeline(\"master_papers.csv\", \"papers/\")"
     ]
    }
   ],

src/prompt_systematic_review/pipeline.py

Lines changed: 1 addition & 0 deletions
@@ -85,6 +85,7 @@ def upload_folder(self, folderName):
         self.api.upload_folder(
             repo_id=self.repo_name,
             folder_path=folderName,
+            path_in_repo=folderName,
             commit_message=f"Add {folderName}",
             repo_type="dataset",
         )
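
The one-line addition matters because `HfApi.upload_folder` defaults to uploading the folder's contents at the repo root; passing `path_in_repo` mirrors the local folder name inside the dataset repo. A minimal sketch of the resulting call, using the dataset repo named elsewhere in this PR as an example:

```python
from huggingface_hub import HfApi

api = HfApi()  # assumes an HF token is already configured (e.g. via HF_TOKEN)
api.upload_folder(
    repo_id="PromptSystematicReview/Prompt_Systematic_Review_Dataset",  # example from this PR
    folder_path="papers",        # local folder to upload
    path_in_repo="papers",       # the new argument: keep files under papers/ in the repo
    commit_message="Add papers",
    repo_type="dataset",
)
```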

topic-model/README.md

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
# Running a topic model on the data

## Installation

First, [install poetry](https://python-poetry.org/docs), then install this package with `poetry install`. NB: it may be possible to install directly with `pip install -e .`, but I haven't tested this.

Type `soup-nuts --help` to make sure the preprocessing package was installed correctly. If it wasn't, clone [this repo](https://github.com/ahoho/topics) and try running `poetry install` there, then `poetry add tomotopy`.

## Process data

Download the CSV of papers and abstracts:

```console
curl https://huggingface.co/datasets/PromptSystematicReview/Prompt_Systematic_Review_Dataset/resolve/main/master_papers.csv -o master_papers.csv
```

Optionally, learn common phrases (e.g., `prompt_engineering`):

```bash
mkdir ./detected-phrases

soup-nuts detect-phrases \
    master_papers.csv \
    ./detected-phrases \
    --input-format csv \
    --text-key abstract \
    --id-key paperId \
    --lowercase \
    --min-count 15 \
    --token-regex wordlike \
    --no-detect-entities
```

Preprocess the data. Feel free to play around with these parameters (see `soup-nuts preprocess --help` for details):

```bash
soup-nuts preprocess \
    master_papers.csv \
    ./processed \
    --text-key abstract \
    --id-key paperId \
    --lowercase \
    --input-format csv \
    --detect-entities \
    --phrases ./detected-phrases/phrases.json \
    --max-doc-freq 0.9 \
    --min-doc-freq 2 \
    --output-text \
    --metadata-keys abstract,title,url \
    --stopwords stopwords.txt
```

## Run the topic model

```
python run_tomotopy.py --num_topics 25 --iterations 1000
```

You can view the outputs in `topic_outputs-<num_topics>.html`.
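
`run_tomotopy.py` itself isn't shown in this diff, but the run step above is, roughly, a plain tomotopy LDA over the preprocessed tokens. A hedged sketch of that loop; the `processed/train.txt` path and the plain-text printout are assumptions, not the script's actual interface:

```python
import tomotopy as tp

# Assumes soup-nuts wrote one whitespace-tokenized document per line
# (the --output-text flag above) to processed/train.txt.
model = tp.LDAModel(k=25, seed=42)
with open("processed/train.txt", encoding="utf-8") as f:
    for line in f:
        tokens = line.split()
        if tokens:
            model.add_doc(tokens)

# Train in chunks so convergence is visible via per-word log-likelihood.
for _ in range(10):
    model.train(100)
    print(f"iter={model.global_step} ll/word={model.ll_per_word:.3f}")

# Top words per topic, a plain-text version of what the HTML report shows.
for k in range(model.k):
    top = ", ".join(word for word, _ in model.get_topic_words(k, top_n=10))
    print(f"topic {k}: {top}")
```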
Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
{
"output_dir": "detected-phrases",
"input_format": "csv",
"passes": 1,
"lowercase": true,
"detect_entities": false,
"detect_noun_chunks": false,
"token_regex": "re.compile('^[\\\\w-]*[a-zA-Z][\\\\w-]*$')",
"min_count": 15,
"threshold": 10.0,
"max_vocab_size": 40000000.0,
"connector_words": "frozenset({'hereby', 'any', '\u2018re', 'when', 'only', 'nobody', 'this', 'whatever', 'whereby', 'ourselves', '\u2019ve', 'his', 'himself', 'afterwards', 'along', \"'ll\", 'mine', 'whenever', 'again', \"'s\", 'latter', \"'d\", 'across', 'five', 'n\u2018t', 'somewhere', 'may', 'themselves', 'did', 'nothing', 'whither', 'her', 'is', 'get', 'can', 'ours', 'could', 'keep', 'for', 'just', 'in', 'quite', 'no', 'such', 'hereafter', 'due', 'really', 'therefore', 'you', 'per', 'give', 'anything', 'using', 'whose', 'anyone', 'as', 'i', 'besides', 'therein', 'anywhere', '\u2018m', 'be', 'sixty', 'wherein', 'amount', 'name', 'whereafter', 'then', 'whereupon', 'still', 'your', \"'re\", 'less', 'make', 'one', 'hereupon', 'please', '\u2018d', 'of', 'yet', 'someone', 'while', 'without', 'how', 'here', 'does', 'whereas', '\u2019m', 'n\u2019t', 'fifty', 'once', 'but', 'empty', '\u2019s', 'thereupon', 'sometimes', 'regarding', 'itself', 'seems', 'front', 'with', '\u2019d', 'there', 'all', 'might', 'our', 'ever', 'were', 'why', 'done', 'many', 'nowhere', 'around', 'otherwise', 'upon', 'made', 'latterly', 'perhaps', 'forty', 'hers', 'these', 'him', 'something', 'namely', 'are', 'other', 'unless', 'until', 'doing', 'nevertheless', 'full', 'become', 'else', 'more', 'meanwhile', 'see', 'beyond', 'further', 'whence', 'among', 'behind', 'former', 'move', 'rather', 'that', 'seem', 'both', 'sometime', 'where', 'on', 'since', 'out', 'however', 'throughout', 'or', 'whole', \"'m\", 'also', 'than', 'few', 'well', 'me', 'often', 'own', '\u2019ll', 'my', 'except', 'wherever', 'least', 'twelve', \"'ve\", 'three', 'another', 'mostly', 'became', 'indeed', 'he', 're', 'always', 'beside', 'by', 'first', 'enough', 'whoever', 'serious', 'everything', 'thence', 'from', 'neither', 'if', 'under', 'anyhow', 'back', 'anyway', 'already', 'whom', 'above', 'us', 'put', 'it', 'onto', 'being', 'everywhere', 'twenty', 'thereby', 'even', 'thus', 'hundred', 'go', 'because', 'over', 'very', 'so', 'four', 'have', 'bottom', 'up', 'used', 'hence', 'seeming', 'everyone', 'each', 'show', 'yourselves', 'nine', 'elsewhere', 'ten', 'its', 'she', 'noone', 'about', 'off', 'never', 'not', 'too', 'next', 'into', 'becoming', 'thereafter', 'we', 'none', 'down', 'every', 'which', '\u2018ll', 'do', 'almost', 'top', 'via', 'several', 'nor', 'much', 'am', 'what', '\u2018ve', 'a', 'them', 'whether', 'their', 'and', 'they', 'side', 'thru', 'before', 'amongst', 'will', 'most', 'either', \"n't\", 'an', 'to', 'between', 'part', 'has', 'alone', 'below', 'together', 'some', 'becomes', 'formerly', 'beforehand', 'though', 'various', 'say', 'after', 'should', 'towards', 'now', 'through', 'must', 'somehow', 'although', 'herein', 'fifteen', 'eight', 'take', 'same', 'eleven', 'last', 'cannot', 'been', 'yours', 'at', 'third', 'had', 'was', 'call', 'others', 'would', 'moreover', 'who', 'those', 'ca', 'against', '\u2018s', '\u2019re', 'yourself', 'the', 'toward', 'two', 'six', 'herself', 'within', 'during', 'seemed', 'myself'})",
"phrases": null,
"max_phrase_len": null,
"n_process": -1,
"encoding": "utf-8",
"id_key": "paperId",
"input_path": "master_papers.csv",
"lines_are_documents": true,
"max_doc_size": null,
"text_key": "abstract"
}
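
The `token_regex` recorded in this dump is the compiled form of the `wordlike` preset passed on the command line: it keeps tokens built from word characters and hyphens that contain at least one ASCII letter, so pure numbers and punctuation are dropped. A quick illustration (the sample tokens are mine):

```python
import re

# The "wordlike" pattern from the params dump, unescaped.
wordlike = re.compile(r"^[\w-]*[a-zA-Z][\w-]*$")

for token in ["prompt", "gpt-4", "f_1", "2023", "--"]:
    verdict = "keep" if wordlike.match(token) else "drop"
    print(f"{token!r}: {verdict}")  # "2023" and "--" are dropped; the rest kept
```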
Lines changed: 183 additions & 0 deletions
@@ -0,0 +1,183 @@
wide_range
benchmark_datasets
multiple_choice
parameter_efficient
knowledge_intensive
experiments_demonstrate
future_research
computer_vision
promising_results
find_relation
contrastive_learning
results_demonstrate
black_box
downstream_tasks
open_source
task_specific
external_knowledge
existing_methods
f1_score
world_scenarios
paper_we_propose
test_cases
recent_work
test_time
fine_tune
resource_languages
code_generation
publicly_available
world_applications
sentiment_analysis
high_quality
great_potential
remarkable_capabilities
end_to_end
image_generation
machine_learning
prompting_technique
superior_performance
retrieval_augmented
paper_we_introduce
training_data
language_models
competitive_performance
prompt_optimization
hand_crafted
point_cloud
meta_learning
shot_learning
style_transfer
fine_tuned
lingual_transfer
prompt_template
find_event
context_examples
reinforcement_learning
pre_trained
ood_nlp
human_like
cot_prompting
shot_setting
code_is_available
multi_task
rosgpt_vision
model_size
work_we_propose
named_entity
chain_of_thought
source_code
high_level
pre_training
prior_work
generalization_ability
language_model
multi_modal
fully_supervised
multi_hop
zero_shot
semantic_parsing
results_indicate
low_resource
instruction_tuning
relation_extraction
f_1
prompt_injection
natural_language
pretrained_language
consistently_outperforms
task_oriented
text_to_sql
step_by_step
models_plms
neural_networks
data_augmentation
propose_a_novel
models_lms
input_output
proposed_method
machine_translation
cross_lingual
intelligence_ai
training_examples
shot_settings
instruction_following
extensive_experiments
processing_nlp
significantly_outperforms
findings_suggest
propose_a_new
language_processing
general_purpose
ground_truth
prompting_techniques
fact_checking
achieves_state
demonstrated_remarkable
prompt_engineering
shot_prompting
new_paradigm
text_classification
novel_approach
jailbreak_prompts
multi_step
real_world
nlp_tasks
generative_ai
r_score
domain_specific
small_number
labeled_data
big_bench
text_to_image
large_scale
human_written
recent_advances
paper_presents
experimental_results
fewshot_lama
vision_language
e_commerce
foundation_models
large_languagemodels
large_language
prompt_based
information_extraction
stable_diffusion
shown_impressive
mental_health
thought_prompting
knowledge_graph
method_achieves
complex_reasoning
paper_proposes
self_supervised
time_consuming
inthis_paper
recent_years
like_chatgpt
reasoning_steps
conduct_extensive
trained_language
fine_grained
prompt_templates
llm_articulated_object_manipulation
fine_tuning
success_rate
entity_recognition
et_al
models_llms
artificial_intelligence
annotated_data
state_of_the_art
context_learning
social_media
learning_icl
knowledge_distillation
thought_cot
demonstrate_the_effectiveness
question_answering
largelanguage_models
decision_making
open_domain
paper_we_present
object_detection
