

NyaaRSS is complete. NyaaScraper is temporarily broken
mpierce35 committed Apr 3, 2022
1 parent a4fdaf1 commit 6b85df1
Showing 3 changed files with 79 additions and 72 deletions.
68 changes: 35 additions & 33 deletions NyaaTranspiler/entities/DataProcess.py
@@ -13,10 +13,11 @@
from bs4 import BeautifulSoup
class DataProcess(object):
def __init__(self):
self.base__url = "http://nyaa.si/"
self.base__dir = os.path.dirname(__file__)
self.base__url = "http://nyaa.si/?"
self.base__rss_url = "https://nyaa.si/?page=rss"
self.base__torrent__link = "https://nyaa.si/download/"
self.base__view__link = "https://nyaa.si/view/"
self.base__dir = os.path.dirname(__file__)


def get_torrent_link(self, url):
@@ -34,25 +35,15 @@ def get_magnet_link(self, url):
soup = BeautifulSoup(html, 'lxml')
return soup.find('a', 'card-footer-item').get('href').strip()
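
get_magnet_link leans on BeautifulSoup's class shorthand: a string passed as the second argument to find() is matched against the element's CSS class. A minimal, self-contained sketch of that lookup, run against a made-up HTML snippet rather than a live Nyaa view page:

from bs4 import BeautifulSoup

# Illustrative markup only; it loosely mimics the footer of a Nyaa view page.
sample_html = """
<div class="card-footer">
  <a class="card-footer-item" href="magnet:?xt=urn:btih:EXAMPLEHASH">Magnet</a>
  <a class="card-footer-item" href="/download/1234.torrent">Torrent</a>
</div>
"""

soup = BeautifulSoup(sample_html, 'lxml')
# The second positional argument to find() filters by CSS class, so this
# returns the first <a> carrying class="card-footer-item" in this made-up markup.
print(soup.find('a', 'card-footer-item').get('href').strip())
# -> magnet:?xt=urn:btih:EXAMPLEHASH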

def parse_rss_feed(self, url, limit=None, _desc=None):
def _parse_rss_feed(self, url=None, limit=None):
"""Parse the RSS feed coming from the Nyaa.si website.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Basic usage:
self._parse_rss_feed('http://nyaa.si/?page=rss&more_query_chain_here')
Args:
url (str, optional): URL of the desired RSS feed.
Defaults to the base RSS URL when omitted.
limit (int, optional): Maximum number of feed items to parse.
Returns:
An OrderedDict holding the feed metadata and the parsed items.
"""
_count = 0
url = self.base__rss_url if url is None else url
html = requests.get(url).content
soup = BeautifulSoup(html, features='lxml')
# saving data as an ordered list
obj = OrderedDict({
"title" : 'Nyaa - Home - Torrent File Feed Parser',
"description": f'Feed Parser for {_desc}',
"description": f'Feed Parser for Home',
"atom": {
'link': soup.find('atom:link').get('href'),
'rel': soup.find('atom:link').get('rel'),
@@ -93,17 +84,15 @@ def parse_rss_feed(self, url, limit=None, _desc=None):
return obj
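
_parse_rss_feed walks the feed's metadata and <item> entries with BeautifulSoup and collects them into an OrderedDict. A self-contained sketch of the same idea against a tiny, made-up inline feed; it uses BeautifulSoup's 'xml' feature here (instead of the 'lxml' HTML feature used above) so that element text such as <link> survives parsing:

from collections import OrderedDict
from bs4 import BeautifulSoup

sample_feed = """<rss version="2.0"><channel>
<title>Nyaa - Home - Torrent File Feed</title>
<item>
<title>[Group] Some Show - 01 [1080p]</title>
<link>https://nyaa.si/download/1234567.torrent</link>
<guid>https://nyaa.si/view/1234567</guid>
</item>
</channel></rss>"""

soup = BeautifulSoup(sample_feed, features='xml')
feed = OrderedDict({
    "title": soup.find('channel').find('title').text,
    "data": [
        {
            "title": item.find('title').text,
            # 'torrent_file' mirrors the key get_data() reads further down;
            # 'view_link' is just an illustrative name.
            "torrent_file": item.find('link').text,
            "view_link": item.find('guid').text,
        }
        for item in soup.find_all('item')
    ],
})
print(feed["data"][0]["torrent_file"])  # -> https://nyaa.si/download/1234567.torrent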


# Quality query is missing
def create_search_query(self, filter_=None, search_string=None, category=None, username=None, search_type="rss"):
base_url = 'https://nyaa.si/?page=rss' if search_type == 'rss' else "https://nyaa.si/?"
def _create_search_query(self, filter_=None, search_query=None, category=None, username=None, search_type=None):
base_url = self.base__rss_url if search_type == 'rss' else self.base__url
query_array = list()
query = str()
rss_queries = ['f', 'q', 'c', 'u']
if filter_ is not None:
query_array.append(dict({"f" : filter_}))
if search_string is not None:
search_string = search_string.replace(' ', '+')
query_array.append(dict({"q": search_string}))
if search_query is not None:
search_query = search_query.replace(' ', '+')
query_array.append(dict({"q": search_query}))
if category is not None:
query_array.append(dict({"c" : category}))
if username is not None:
@@ -112,18 +101,16 @@ def create_search_query(self, filter_=None, search_string=None, category=None, u
for q in query_array:
for key, value in q.items():
query += f"&{key}={value}"
return (base_url + query)

link = base_url + query
print(f"Search link: {link}")
return link
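
_create_search_query folds the optional arguments into &f=, &q=, &c= and &u= parameters on top of the RSS or plain search base URL. A standalone sketch of that assembly; the parameter keys come from the code above, while the example values are made up:

def build_search_url(filter_=None, search_query=None, category=None,
                     username=None, search_type='rss'):
    # Start from the RSS or plain base URL and append one &key=value
    # pair per supplied argument, spaces in the query becoming '+'.
    base_url = "https://nyaa.si/?page=rss" if search_type == 'rss' else "https://nyaa.si/?"
    params = []
    if filter_ is not None:
        params.append(("f", filter_))
    if search_query is not None:
        params.append(("q", search_query.replace(' ', '+')))
    if category is not None:
        params.append(("c", category))
    if username is not None:
        params.append(("u", username))
    return base_url + "".join(f"&{key}={value}" for key, value in params)

print(build_search_url(filter_=0, search_query="one piece", category="1_2", username="someuser"))
# -> https://nyaa.si/?page=rss&f=0&q=one+piece&c=1_2&u=someuser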

# RSS torrent file retrieval
def get_torrent_files(self, url, limit=None):
feed_data = self.parse_rss_feed(url, limit=limit)
def _rss_get_torrent_files(self, url=None, limit=None):
feed_data = self._parse_rss_feed(url=url, limit=limit)
return self.get_data(feed_data)

def get_magnet(self, id_):
view_link = "{0}{1}".format(self.base__view__link, str(id_))
html = requests.get(view_link).content
soup = BeautifulSoup(html, 'lxml')
return soup.find('a', 'card-footer-item').get('href')

def get_file(self, id_):
try:
@@ -154,21 +141,30 @@ def get_file(self, id_):

# get multiple files from structure
def get_data(self, item_list):
"""
Download torrent files from the feed structure returned by _parse_rss_feed().
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Args:
item_list --> parsed feed dict whose item['torrent_file'] entries hold the torrent links
"""
try:
_count = 0
base_dir = os.path.dirname(__file__)
mdir = os.path.join(base_dir, "automated")
mdir = os.path.join(self.base__dir, "automated")
# check if directory exists
if not os.path.exists(mdir):
os.mkdir(mdir)
print('Directory created.')
else:
print('Directory exists.')

for item in item_list['data']:
with requests.get(item['torrent_file'], stream=True) as r:
r.raise_for_status()
invalid_chars = r'<>:"/\\|?*'
pattern = r'[' + invalid_chars + ']'
new_name = re.sub(pattern, ' ', item['title'])[:155] # As Windows files are 155 character-limited.
new_name = re.sub(pattern, ' ', item['title'])[:155]  # Truncated so the full path stays within Windows path-length limits.
with open(os.path.join(mdir, 'log.txt'), 'a', encoding='utf-8') as log:
log.write(f"File saved: {new_name}.torrent \n")
with open(os.path.join(mdir, f"{new_name}.torrent"), "wb") as f:
@@ -178,6 +174,12 @@ def get_data(self, item_list):
_count += 1
finally:
print(f"Downloaded {_count} torrent files.")

def get_magnet(self, id_):
view_link = "{0}{1}".format(self.base__view__link, str(id_))
html = requests.get(view_link).content
soup = BeautifulSoup(html, 'lxml')
return soup.find('a', 'card-footer-item').get('href')


# This is purely experimental, not guaranteed to
83 changes: 44 additions & 39 deletions NyaaTranspiler/entities/NyaaRSS.py
@@ -1,6 +1,6 @@
"""
TODO:
---Don't overwrite exisiting torrent files/data
---Add an option to overwrite or skip existing torrent files/data
---Validate that the user has submitted a well-formed query
---Add more debug console data.
---Raise an exception if the page returns empty.
@@ -16,60 +16,65 @@
import pprint
import string
class NyaaRSS(DataProcess):
def RSS_get_latest_feed_data(self, rtype='dict', limit=None):
feed_data = self.parse_rss_feed("https://nyaa.si/?page=rss&", limit=limit)
def __init__(self):
super().__init__()


def get_latest_feed_data(self, rtype='dict', limit=None):
pp = pprint.PrettyPrinter(indent=4)
feed_data = self._parse_rss_feed(limit=limit)
try:
if rtype == 'json':
return json.dumps(feed_data)
if rtype == 'dict':
return feed_data
if rtype == 'debug':
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(type(obj))
if rtype is not ['json', 'dict', 'debug']:
raise TypeError('Invalid type, try again. i.e --> type="dict"/type="json"')
print(f"Object type: {feed_data.__class__()}")
pp.pprint(feed_data)
except JSONDecodeError:
raise ('Invalid type, try again. i.e --> rtype="dict"/rtype="json"')
raise ValueError('Error while parsing data to JSON notation.')

def RSS_get_latest_torrent_files(self, limit=None):
return self.get_torrent_files("https://nyaa.si/?page=rss&", limit=limit)
def get_latest_torrent_files(self, limit=None):
return self._rss_get_torrent_files(limit=limit)


def RSS_query_search_data(self,
filter_type=None,
query=None,
category=None,
username=None,
limit=None):
def query_search_data(self,
filter_=None,
search_query=None,
category=None,
username=None,
limit=None):

search_url = self.create_search_query(filter_=filter_type,
search_string=query,
search_url = self._create_search_query(filter_=filter_,
search_query=search_query,
category=category,
username=username)
username=username,
search_type='rss')

print(f"Search link: {search_url}")
return self.parse_rss_feed(search_url, limit=limit, _desc=query)
return self._parse_rss_feed(search_url, limit=limit)


def RSS_get_query_search_torrents(self,
filter_type=None,
query=None,
category=None,
username=None,
limit=None):
search_url = self.create_search_query(filter_=filter_type,
search_string=query,
def query_search_torrents(self,
filter_=None,
search_query=None,
category=None,
username=None,
limit=None):

search_url = self._create_search_query(filter_=filter_,
search_query=search_query,
category=category,
username=username)
self.get_torrent_files(search_url, limit=limit)
username=username,
search_type='rss')

self._rss_get_torrent_files(url=search_url, limit=limit)


def RSS_search_data_by_username(self, username=None, limit=None):
search_url = self.create_search_query(username=username)
print(f"username: {username} \n search link: {search_url}")
return self.parse_rss_feed(search_url, limit=limit)
def get_data_by_username(self, username=None, limit=None):
search_url = self._create_search_query(username=username, search_type='rss')
return self._parse_rss_feed(search_url, limit=limit)

def RSS_get_torrents_by_username(self, username=None, limit=None):
search_url = self.create_search_query(username=username)
print(f" username: {username}\nsearch link: {search_url}")
self.get_torrent_files(search_url, limit=limit)
def get_torrents_by_username(self, username=None, limit=None):
search_url = self._create_search_query(username=username, search_type='rss')
self._rss_get_torrent_files(search_url, limit=limit)
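
Taken together, the renamed methods give NyaaRSS a small public surface over the DataProcess internals. A hypothetical usage sketch; the import path is assumed from the file layout shown above, the query values are made up, and the calls need network access to nyaa.si:

# Assumed import path, based on NyaaTranspiler/entities/NyaaRSS.py.
from NyaaTranspiler.entities.NyaaRSS import NyaaRSS

nyaa = NyaaRSS()

# Latest feed as a plain dict, capped at 10 items.
latest = nyaa.get_latest_feed_data(rtype='dict', limit=10)

# Search the RSS feed and download the matching .torrent files
# (made-up filter/category values, purely for illustration).
nyaa.query_search_torrents(filter_=0, search_query="one piece", category="1_2", limit=5)

# Feed data published by a given uploader.
by_user = nyaa.get_data_by_username(username="someuser", limit=20)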

Binary file modified NyaaTranspiler/entities/__pycache__/DataProcess.cpython-310.pyc
