Commit
Feat: made modification to version1 scrapper
Peter-Immanuel committed Feb 25, 2024
1 parent 2d9ded8 commit a13a1fe
Showing 5 changed files with 146 additions and 20 deletions.
4 changes: 2 additions & 2 deletions engine/root.py
@@ -53,7 +53,7 @@ class BaseEngine(ABC):
results=None

def __init__(self) -> None:
super().__init__()
super().__init__()

@abstractmethod
def search(self, query=None, page=None, category=None,**kwargs):
@@ -141,7 +141,7 @@ async def fetch_webpage(
self, session: aiohttp.ClientSession, url: str, method=None):
"""
Returns the source code of a webpage if it exists, or None.
:param session: reusable aiohttp.ClientSession
:param url: URL to pull the source code from
18 changes: 11 additions & 7 deletions engine/songslover.py
@@ -19,7 +19,7 @@ def __init__(self):
super().__init__()
self.site_uri = "https://songslover.me/"
self.request_method = self.GET

def custom_fetch(self, category="tracks", page=1, **kwargs):
# Parse Uri
url = (self.get_formated_url(url=kwargs.pop('url'), path="", params='') if kwargs
       else self.get_formated_url(category=category, page=page, params={}, **kwargs))
@@ -40,7 +40,7 @@ def custom_fetch(self, category="tracks", page=1, **kwargs):

def custom_parser(self, link, category, **kwargs):

# Get song page
# Get song Webpage
soup = self.get_response_object(link)

# Extract artist, song title, art and download_link
@@ -51,9 +51,8 @@ def custom_parser(self, link, category, **kwargs):

# For track extract download link.
if category == "tracks":
if download_link!=None:
return dict(type='track',category=category,artist=artist,title=title,
download=download_link,art=art_link)
return dict(type='track',category=category,artist=artist,title=title,
download=download_link,art=art_link)


# For album extract get each song and their link
@@ -63,7 +62,6 @@ def custom_parser(self, link, category, **kwargs):
download=download_link,art=art_link,details=track_details)

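For reference, a hedged sketch of the dicts custom_parser appears to return for the two visible branches; the album branch's call head is collapsed in this view, so its type value is assumed, and all field values below are illustrative placeholders, not data from the site:

# track branch
{'type': 'track', 'category': 'tracks', 'artist': '<artist>', 'title': '<title>',
 'download': '<download link>', 'art': '<art link>'}

# album branch, adding per-track details
{'type': 'album', 'category': 'albums', 'artist': '<artist>', 'title': '<title>',
 'download': '<download link>', 'art': '<art link>', 'details': [...]}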


def fetch(self,category='tracks',page=1,**kwargs):
"""Fetch Latest Items Based on Category and Page Number"""

@@ -76,6 +74,7 @@ def fetch(self,category='tracks',page=1,**kwargs):
self.results = self.parse_parent_object(soup,**kwargs)
return self.results


def search(self,query='',page=1,category=None,**kwargs):
"""Search Engine with query, page ,category parameters"""

@@ -162,6 +161,7 @@ def get_url_path(self, page=None, category=None):
page = 255
return (category, self.page_path, str(page))


def _get_description(self, soup):
description = soup.select(
'div[class="post-inner"] h1 span[itemprop="name"]'
@@ -174,6 +174,7 @@

return artist, title


def _get_art_link(self, soup):
"""Generate the art link of a song
@@ -183,11 +184,14 @@
try:
art_link = soup.select('div[class="entry"] img[src]')[0]["data-src"]
except Exception:
art_link = None
art_link = "Unavailable"

return art_link

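For intuition, a self-contained sketch of the selector logic above against hypothetical markup (the HTML string and CDN URL are made up):

from bs4 import BeautifulSoup

html = '<div class="entry"><img src="a.jpg" data-src="https://cdn.example/art.jpg"></div>'
soup = BeautifulSoup(html, "html.parser")

try:
    # Same selector as above: first <img src> inside the entry div, read its data-src
    art_link = soup.select('div[class="entry"] img[src]')[0]["data-src"]
except Exception:
    art_link = "Unavailable"

print(art_link)  # https://cdn.example/art.jpg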

def _get_download_link(self,soup, category):
download_link = None

if category == "track":
# Handle different Formats to Get Download Link
regex_group = [
80 changes: 73 additions & 7 deletions engine/version2/root.py
@@ -8,7 +8,7 @@
from urllib.parse import urlparse, urlencode
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup as bs4
from typing import List
from typing import List, Tuple
import utils.helpers as helpers


@@ -20,17 +20,83 @@ class BaseEngine(ABC):
# Engine properties
summary: str = None
name: str = None
categories: List[str] = []
categories: Tuple[str, ...] = ()
site_uri: str = None
request_methods =
request_method = None
formated_url = None
GET, POST = "get", "post"

@abstractmethod
def search(self,query: str=None,page: str=None,
category: str=None,**kwargs):
@abstractmethod
def search(
self, query: str=None, page: str=None,
category: str=None,**kwargs
):
"""
Method to query a particular engine
"""
raise NotImplementedError()


def get_formated_url(
self,url=None,path=None,
page=None, category=None,
query=None, method=None,
params=None, **kwargs
):
"""
Return a formatted music engine search or fetch URL
"""
# URL defaults to the engine's site URI unless a custom URL is passed in
url = urlparse(self.site_uri) if url is None else urlparse(url)

# Build the URL path and query params from explicit arguments, else fall back to get_url_path/get_query_params
url_path = helpers.join_url_path(self.get_url_path(
page=page,category=category,**kwargs)
) if path is None else helpers.join_url_path(path)

params = self.get_query_params(
query=query,
page=page,
category=category,
**kwargs) if params is None else params

# Default to the engine's request method; only GET URLs carry a query string
method = self.request_method if method is None else method
self.formated_url = (
url._replace(
path=url_path, query=urlencode(params)
)
if method == self.GET
else url._replace(path=url_path)
)

return self.formated_url.geturl()

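To make the flow concrete, a minimal sketch of what get_formated_url effectively does for a GET request, assuming get_url_path returns (category, 'page', str(page)) as in version1 and get_query_params yields a plain dict; the path pieces and params below are illustrative:

from urllib.parse import urlparse, urlencode

base = urlparse("https://songslover.me/")
url_path = "/".join(("tracks", "page", "2"))  # stand-in for helpers.join_url_path(...)
params = {"s": "love"}                        # stand-in for get_query_params(...)

# GET URLs carry the encoded params; POST URLs would keep only the path
formated = base._replace(path=url_path, query=urlencode(params))
print(formated.geturl())  # https://songslover.me/tracks/page/2?s=love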

async def get_response_object(
        self, session: aiohttp.ClientSession, url: str, method=None, payload=None):
    """
    Returns the source code of a webpage if it exists, or None.
    :param session: reusable aiohttp.ClientSession
    :param url: URL to pull the source code from
    :param method: str -> request method post/get
    :param payload: dict -> a payload for POST requests
    :return: parsed HTML soup of the given URL, or None.
    """
    if method == self.POST:
        # POST handling is not implemented yet
        pass

    # Get the url asynchronously
    async with session.get(url) as response:
        if response.status == 200:
            webpage = await response.text()
            soup = bs4(webpage, "html.parser")
            return soup
        return None

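A quick usage sketch under the assumptions above, with one reusable session per run; the URL and the concrete engine are illustrative, with SongsLover defined in the next file:

import asyncio
import aiohttp

async def demo():
    async with aiohttp.ClientSession() as session:
        engine = SongsLover()  # any concrete BaseEngine subclass
        soup = await engine.get_response_object(session, "https://songslover.me/")
        if soup is not None:
            print(soup.title.string if soup.title else "no <title> found")

asyncio.run(demo())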


43 changes: 43 additions & 0 deletions engine/version2/songslovers.py
@@ -0,0 +1,43 @@
import asyncio, time
import re
from concurrent.futures import ProcessPoolExecutor
import aiohttp
from engine.version2.root import BaseEngine
from tqdm import tqdm
from itertools import repeat


class SongsLover(BaseEngine):
name = "Songslover"
summary = "SongsLover is a website that both search and fetch songs"
categories = (
'albums','tracks',
'best-of-the-month','mixtapes',
'music-albums'
)

def __init__(self):
super().__init__()
self.site_uri = "https://songslover.me/"
self.request_method = self.GET

async def fetch(self, category="tracks", page=1, **kwargs):
    # Parse Uri
    url = (self.get_formated_url(url=kwargs.pop('url'), path="", params='') if kwargs
           else self.get_formated_url(category=category, page=page, params={}, **kwargs))

    # Get page containing tracks (one session per fetch)
    async with aiohttp.ClientSession() as session:
        soup = await self.get_response_object(session, url)
    if soup is None:
        return []

    # Extract track links from page
    track_list = [elem['href'] for elem in soup.select("article h2 a")]

    # Extract the details of each track in parallel worker processes
    with ProcessPoolExecutor() as executor:
        result = executor.map(self.custom_parser, track_list, repeat(category))

    return [elem for elem in result]

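A hedged end-to-end sketch of driving this engine; it assumes network access and that the site's markup still matches the selectors above:

import asyncio
from engine.version2.songslovers import SongsLover

async def main():
    engine = SongsLover()
    tracks = await engine.fetch(category="tracks", page=1)
    for item in tracks[:3]:
        print(item)

asyncio.run(main())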


21 changes: 17 additions & 4 deletions utils/helpers.py
@@ -4,6 +4,8 @@
import validators
from validators import ValidationFailure
import json
import functools
import asyncio


USER_AGENTS = [
@@ -82,12 +84,23 @@ def numerize(num):
return f"{num/10**6:.1f}M"
return num

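Only the millions branch of numerize is visible in this hunk; assuming the collapsed lines handle smaller magnitudes the same way, expected behavior looks like:

numerize(2_500_000)  # "2.5M"
numerize(999)        # 999 (small values fall through to `return num`)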

def noop():
pass

class CacheHandler:
def cache_fetched_items():
return "Fetched results stored in a cache"


# Convert a sync function to async
def force_async(fn):
'''
Turn a sync function into an async function by running it in a thread pool.
'''
from concurrent.futures import ThreadPoolExecutor
pool = ThreadPoolExecutor()

@functools.wraps(fn)
def wrapper(*args, **kwargs):
future = pool.submit(fn, *args, **kwargs)
return asyncio.wrap_future(future) # make it awaitable

return wrapper

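A minimal usage sketch for force_async; slow_add is a hypothetical blocking function:

import asyncio, time
from utils.helpers import force_async

@force_async
def slow_add(a, b):
    time.sleep(1)  # blocking work now runs in the thread pool
    return a + b

async def main():
    result = await slow_add(1, 2)  # awaitable future; the event loop stays responsive
    print(result)                  # 3

asyncio.run(main())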