made some final changes, and updated readme

shivendrra · Mar 10, 2024 · 5837afe · 5837afe
1 parent d3676eb
commit 5837afe
Show file tree

Hide file tree

Showing 6 changed files with 73 additions and 9 deletions.
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# data-collection
+# web-graze
 
 ## Introduction
 This repo contains code that helps you scrape data from various sites on the internet, such as Wikipedia, Britannica, YouTube, etc.
@@ -13,7 +13,14 @@ bs = Scrapper(search_queries=['antarctica', 'america', 'continents'], max_limit=
 bs(out_file='../scrapped_data.txt.')
 ```
 
-I've made a sample `search_queries.json` that contains few keywords that could be used to scraped the pages. You can use your own, though
+I've made a sample `search_queries.json` that contains a few keywords that can be used to scrape the pages. You can use your own, though.
+
+```python
+from britannica import searchQueries
+
+queries = searchQueries()
+print(queries())
+```
 
 ### Transcripts Collector
 It uses the [YouTube Data API v3](https://developers.google.com/youtube/v3/docs) to fetch the videos uploaded by a particular channel and collects their `video_ids`, which are then used to generate transcripts using [youtube-transcript-api](https://github.com/jdepoix/youtube-transcript-api/tree/master).
@@ -62,10 +69,11 @@ print(snippets())
 .
 ├── britannica
 │   ├── __init__.py
-│   ├── URLFetcher.py
-│   ├── requirements.txt
 │   ├── main.py
+│   ├── queries.py
+│   ├── requirements.txt
 │   ├── search_queries.json
+│   ├── URLFetcher.py
 ├── javascript
 │   ├── customLinkFinder.js
 │   ├── customSearch.js
@@ -76,16 +84,18 @@ print(snippets())
 ├── youtube_transcripts
 │   ├── __init__.py
 │   ├── basic.py
-│   ├── snippets.py
-│   ├── channel_ids.json
 │   ├── channe_ids_snippet.json
-│   ├── requirements.txt
+│   ├── channel_ids.json
 │   ├── main.py
+│   ├── requirements.txt
+│   ├── snippets.py
 │   ├── version2.py
 ├── .env
 ├── .gitignore
-├── README.md
+├── CONTRIBUTING.md
 ├── LargeDataCollector.ipynb
+├── LICENSE
+├── README.md
 ├── test.py
 ```
 

diff --git a/britannica/__init__.py b/britannica/__init__.py
@@ -1,2 +1,3 @@
 from .main import Scrapper
-from .URLFetcher import BritannicaUrls
+from .URLFetcher import BritannicaUrls
+from .queries import searchQueries
diff --git a/britannica/__pycache__/URLFetcher.cpython-311.pyc b/britannica/__pycache__/URLFetcher.cpython-311.pyc
diff --git a/britannica/__pycache__/__init__.cpython-311.pyc b/britannica/__pycache__/__init__.cpython-311.pyc
diff --git a/britannica/__pycache__/main.cpython-311.pyc b/britannica/__pycache__/main.cpython-311.pyc
diff --git a/britannica/queries.py b/britannica/queries.py
@@ -0,0 +1,53 @@
+"""
+  --> contains some sample search queries for the britannica scrapper
+"""
+
+from typing import Any
+
+
+class searchQueries:
+  """Holds the sample search queries for the britannica scrapper.
+
+  Instantiate the class and call the instance to get the list of query
+  strings, e.g. ``searchQueries()()``.
+  """
+
+  def __init__(self) -> None:
+    # One entry per search term; entries are kept unique (ignoring case)
+    # so that the same topic is not queried twice.
+    self.search_queries: list[str] = [
+      "antarctica",
+      "colonization",
+      "world war",
+      "asia",
+      "africa",
+      "australia",
+      "holocaust",
+      "voyages",
+      "biological viruses",
+      "Martin Luther King Jr",
+      "Abraham Lincoln",
+      "Quarks",
+      "Quantum Mechanics",
+      "Drugs",
+      "Rockets",
+      "Physics",
+      "Mathematics",
+      "nuclear physics",
+      "nuclear fusion",
+      "CRISPR CAS-9",
+      "virginia woolf",
+      "cocaine",
+      "marijuana",
+      "apollo missions",
+      "birds",
+      "blogs",
+      "journal",
+      "Adolf Hitler",
+      "Presidents of United States",
+      "genders and sexes",
+      "journalism",
+      "maths theories",
+      "matter and particles",
+      "discoveries",
+      "authors and writers",
+      "poets and novel writers",
+      "literature",
+      "awards and honors",
+    ]
+
+  def __call__(self) -> list[str]:
+    """Return the list of sample search queries held by this instance."""
+    return self.search_queries