Commit
Feat: made modification to version1 scrapper
Peter-Immanuel committed Feb 25, 2024
1 parent 2d9ded8 commit a13a1fe
Showing 5 changed files with 146 additions and 20 deletions.
4 changes: 2 additions & 2 deletions engine/root.py
@@ -53,7 +53,7 @@ class BaseEngine(ABC):
results=None

def __init__(self) -> None:
super().__init__()
super().__init__()

@abstractmethod
def search(self, query=None, page=None, category=None,**kwargs):
@@ -141,7 +141,7 @@ async def fetch_webpage(
self, session: aiohttp.ClientSession, url: str, method=None):
"""
Returns the source code of a webpage if it exists, or None.
:param session: reusable aiohttp.ClientSession
:param url: URL to pull the source code from
18 changes: 11 additions & 7 deletions engine/songslover.py
@@ -19,7 +19,7 @@ def __init__(self):
super().__init__()
self.site_uri = "https://songslover.me/"
self.request_method = self.GET

def custom_fetch(self, category="tracks", page=1, **kwargs):
# Parse Uri
url = (self.get_formated_url(url=kwargs.pop('url'), path="", params='') if kwargs
       else self.get_formated_url(category=category, page=page, params={}, **kwargs))
@@ -40,7 +40,7 @@ def custom_fetch(self, category="tracks", page=1, **kwargs):

def custom_parser(self, link, category, **kwargs):

# Get song page
# Get song Webpage
soup = self.get_response_object(link)

# Extract artist, song title, art and download_link
@@ -51,9 +51,8 @@ def custom_parser(self, link, category, **kwargs):

# For track extract download link.
if category == "tracks":
if download_link!=None:
return dict(type='track',category=category,artist=artist,title=title,
download=download_link,art=art_link)
return dict(type='track',category=category,artist=artist,title=title,
download=download_link,art=art_link)


# For album extract get each song and their link
@@ -63,7 +62,6 @@ def custom_parser(self, link, category, **kwargs):
download=download_link,art=art_link,details=track_details)

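For reference, a hedged sketch of the dicts custom_parser appears to return for the two visible branches; the album branch's call head is collapsed in this view, so its type value is assumed, and all field values below are illustrative placeholders, not data from the site:

# track branch
{'type': 'track', 'category': 'tracks', 'artist': '<artist>', 'title': '<title>',
 'download': '<download link>', 'art': '<art link>'}

# album branch, adding per-track details
{'type': 'album', 'category': 'albums', 'artist': '<artist>', 'title': '<title>',
 'download': '<download link>', 'art': '<art link>', 'details': [...]}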


def fetch(self,category='tracks',page=1,**kwargs):
"""Fetch Latest Items Based on Category and Page Number"""

@@ -76,6 +74,7 @@ def fetch(self,category='tracks',page=1,**kwargs):
self.results = self.parse_parent_object(soup,**kwargs)
return self.results


def search(self,query='',page=1,category=None,**kwargs):
"""Search Engine with query, page ,category parameters"""

@@ -162,6 +161,7 @@ def get_url_path(self, page=None, category=None):
page = 255
return (category, self.page_path, str(page))


def _get_description(self, soup):
description = soup.select(
'div[class="post-inner"] h1 span[itemprop="name"]'
@@ -174,6 +174,7 @@

return artist, title


def _get_art_link(self, soup):
"""Generate the art link of a song
@@ -183,11 +184,14 @@
try:
art_link = soup.select('div[class="entry"] img[src]')[0]["data-src"]
except Exception:
art_link = None
art_link = "Unavailable"

return art_link

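For intuition, a self-contained sketch of the selector logic above against hypothetical markup (the HTML string and CDN URL are made up):

from bs4 import BeautifulSoup

html = '<div class="entry"><img src="a.jpg" data-src="https://cdn.example/art.jpg"></div>'
soup = BeautifulSoup(html, "html.parser")

try:
    # Same selector as above: first <img src> inside the entry div, read its data-src
    art_link = soup.select('div[class="entry"] img[src]')[0]["data-src"]
except Exception:
    art_link = "Unavailable"

print(art_link)  # https://cdn.example/art.jpg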

def _get_download_link(self,soup, category):
download_link = None

if category == "track":
# Handle different Formats to Get Download Link
regex_group = [
80 changes: 73 additions & 7 deletions engine/version2/root.py
@@ -8,7 +8,7 @@
from urllib.parse import urlparse, urlencode
from abc import ABC, abstractmethod
from bs4 import BeautifulSoup as bs4
from typing import List
from typing import List, Tuple
import utils.helpers as helpers


@@ -20,17 +20,83 @@ class BaseEngine(ABC):
# Engine properties
summary: str = None
name: str = None
categories: List[str] = []
categories: Tuple[str, ...] = ()
site_uri: str = None
request_methods =
request_method = None
formated_url = None
GET, POST = "get", "post"

@abstractmethod
def search(self,query: str=None,page: str=None,
category: str=None,**kwargs):
@abstractmethod
def search(
self, query: str=None, page: str=None,
category: str=None,**kwargs
):
"""
Method to query a particular engine
"""
raise NotImplementedError()


def get_formated_url(
self,url=None,path=None,
page=None, category=None,
query=None, method=None,
params=None, **kwargs
):
"""
Return a formatted music engine search or fetch URL
"""
# URL defaults to the engine's site URI unless a custom URL is passed in
url = urlparse(self.site_uri) if url is None else urlparse(url)

# Build the URL path and query params from explicit arguments, else fall back to get_url_path/get_query_params
url_path = helpers.join_url_path(self.get_url_path(
page=page,category=category,**kwargs)
) if path is None else helpers.join_url_path(path)

params = self.get_query_params(
query=query,
page=page,
category=category,
**kwargs) if params is None else params

# Default to the engine's request method; only GET URLs carry a query string
method = self.request_method if method is None else method
self.formated_url = (
url._replace(
path=url_path, query=urlencode(params)
)
if method == self.GET
else url._replace(path=url_path)
)

return self.formated_url.geturl()

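To make the flow concrete, a minimal sketch of what get_formated_url effectively does for a GET request, assuming get_url_path returns (category, 'page', str(page)) as in version1 and get_query_params yields a plain dict; the path pieces and params below are illustrative:

from urllib.parse import urlparse, urlencode

base = urlparse("https://songslover.me/")
url_path = "/".join(("tracks", "page", "2"))  # stand-in for helpers.join_url_path(...)
params = {"s": "love"}                        # stand-in for get_query_params(...)

# GET URLs carry the encoded params; POST URLs would keep only the path
formated = base._replace(path=url_path, query=urlencode(params))
print(formated.geturl())  # https://songslover.me/tracks/page/2?s=love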

async def get_response_object(
        self, session: aiohttp.ClientSession, url: str, method=None, payload=None):
    """
    Returns the source code of a webpage if it exists, or None.
    :param session: reusable aiohttp.ClientSession
    :param url: URL to pull the source code from
    :param method: str -> request method post/get
    :param payload: dict -> a payload for POST requests
    :return: parsed HTML soup of the given URL, or None.
    """
    if method == self.POST:
        # POST handling is not implemented yet
        pass

    # Get the url asynchronously
    async with session.get(url) as response:
        if response.status == 200:
            webpage = await response.text()
            soup = bs4(webpage, "html.parser")
            return soup
        return None

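A quick usage sketch under the assumptions above, with one reusable session per run; the URL and the concrete engine are illustrative, with SongsLover defined in the next file:

import asyncio
import aiohttp

async def demo():
    async with aiohttp.ClientSession() as session:
        engine = SongsLover()  # any concrete BaseEngine subclass
        soup = await engine.get_response_object(session, "https://songslover.me/")
        if soup is not None:
            print(soup.title.string if soup.title else "no <title> found")

asyncio.run(demo())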


43 changes: 43 additions & 0 deletions engine/version2/songslovers.py
@@ -0,0 +1,43 @@
import asyncio, time
import re
from concurrent.futures import ProcessPoolExecutor
import aiohttp
from engine.version2.root import BaseEngine
from tqdm import tqdm
from itertools import repeat


class SongsLover(BaseEngine):
name = "Songslover"
summary = "SongsLover is a website that both search and fetch songs"
categories = (
'albums','tracks',
'best-of-the-month','mixtapes',
'music-albums'
)

def __init__(self):
super().__init__()
self.site_uri = "https://songslover.me/"
self.request_method = self.GET

async def fetch(self, category="tracks", page=1, **kwargs):
    # Parse Uri
    url = (self.get_formated_url(url=kwargs.pop('url'), path="", params='') if kwargs
           else self.get_formated_url(category=category, page=page, params={}, **kwargs))

    # Get page containing tracks (one session per fetch)
    async with aiohttp.ClientSession() as session:
        soup = await self.get_response_object(session, url)
    if soup is None:
        return []

    # Extract track links from page
    track_list = [elem['href'] for elem in soup.select("article h2 a")]

    # Extract the details of each track in parallel worker processes
    with ProcessPoolExecutor() as executor:
        result = executor.map(self.custom_parser, track_list, repeat(category))

    return [elem for elem in result]

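A hedged end-to-end sketch of driving this engine; it assumes network access and that the site's markup still matches the selectors above:

import asyncio
from engine.version2.songslovers import SongsLover

async def main():
    engine = SongsLover()
    tracks = await engine.fetch(category="tracks", page=1)
    for item in tracks[:3]:
        print(item)

asyncio.run(main())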


21 changes: 17 additions & 4 deletions utils/helpers.py
@@ -4,6 +4,8 @@
import validators
from validators import ValidationFailure
import json
import functools
import asyncio


USER_AGENTS = [
@@ -82,12 +84,23 @@ def numerize(num):
return f"{num/10**6:.1f}M"
return num

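Only the millions branch of numerize is visible in this hunk; assuming the collapsed lines handle smaller magnitudes the same way, expected behavior looks like:

numerize(2_500_000)  # "2.5M"
numerize(999)        # 999 (small values fall through to `return num`)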

def noop():
pass

class CacheHandler:
def cache_fetched_items():
return "Fetched results stored in a cache"


# Convert a sync function to async
def force_async(fn):
'''
Turn a sync function into an async function by running it in a thread pool.
'''
from concurrent.futures import ThreadPoolExecutor
pool = ThreadPoolExecutor()

@functools.wraps(fn)
def wrapper(*args, **kwargs):
future = pool.submit(fn, *args, **kwargs)
return asyncio.wrap_future(future) # make it awaitable

return wrapper

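A minimal usage sketch for force_async; slow_add is a hypothetical blocking function:

import asyncio, time
from utils.helpers import force_async

@force_async
def slow_add(a, b):
    time.sleep(1)  # blocking work now runs in the thread pool
    return a + b

async def main():
    result = await slow_add(1, 2)  # awaitable future; the event loop stays responsive
    print(result)                  # 3

asyncio.run(main())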