Skip to content

Commit 8b72e43

Browse files
committed
Image Crawler Added
1 parent a59b46b commit 8b72e43

File tree

10 files changed

+66
-66
lines changed

10 files changed

+66
-66
lines changed

Crawler/Image.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
import urllib.parse
2+
from Crawler.ImgFinder import ImgFinder
3+
import Models.Queue.Image
4+
5+
6+
class Image:
    """Collect the image URLs found on one page and push them to the image queue.

    Attributes:
        image: the ``Models.Queue.Image.Image`` object the URLs are saved through.
        page_url: URL of the page being crawled.
        base_url: network location (host[:port]) parsed from ``page_url``.
        images: URLs collected so far for this page.
    """

    def __init__(self, page_url):
        """Prepare the queue model and parse ``page_url``.

        :param page_url: absolute URL of the page whose images are crawled.
        """
        # NOTE(review): this binds the model *class*, not an instance, and
        # calls ``fetch`` on it -- confirm that ``fetch``/``add``/``save`` are
        # meant to be used at class level.
        image = Models.Queue.Image.Image
        image.fetch()
        self.image = image

        self.page_url = page_url
        self.base_url = urllib.parse.urlparse(page_url).netloc
        self.images = set()

    def sanitize_url(self, link):
        """Return ``link`` as an absolute URL resolved against ``page_url``.

        :param link: URL as found in the page; may be relative.
        :return: the absolute URL, or ``''`` when ``link`` is empty or blank.
        """
        if not link:
            return ''
        cleaned = link.strip()
        if not cleaned:
            return ''
        # urljoin leaves already-absolute URLs untouched and resolves
        # relative ones against the page they were found on.
        return urllib.parse.urljoin(self.page_url, cleaned)

    def add(self, link):
        """Add one link to the collected set, skipping empty or blank links.

        :param link: URL to add; resolved through ``sanitize_url`` first.
        :return: ``self``, so calls can be chained.
        """
        full_url = self.sanitize_url(link)
        if full_url:
            self.images.add(full_url)
        return self

    def fetch_links(self):
        """Download the page and collect the values reported by ``ImgFinder``.

        Replaces ``self.images`` with whatever ``ImgFinder.get_values()``
        returns for ``page_url``.

        :return: the newly collected values.
        """
        img_finder = ImgFinder(self.page_url)
        img_finder.feed(img_finder.html_string())
        self.images = img_finder.get_values()
        return self.images

    def links(self):
        """Return the URLs collected so far."""
        return self.images

    def _merge_links(self):
        """Add every collected URL to the queue model.

        :return: the model's ``links`` attribute.
        """
        for lk in self.images:
            self.image.add(lk)
        # NOTE(review): ``links`` is returned without being called -- confirm
        # whether it is an attribute or a method on the model.
        return self.image.links

    def save(self):
        """Merge the collected URLs into the queue model and save it."""
        self._merge_links()
        self.image.save()

    def save_links(self):
        """Fetch the page's image URLs, then save them to the queue."""
        self.fetch_links()
        self.save()
49+
50+
51+

Crawler/ImgFinder.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from UrlFinder import UrlFinder
2+
3+
4+
class ImgFinder(UrlFinder):
    """``UrlFinder`` preconfigured with the ``'img'`` / ``'src'`` pair."""

    def __init__(self, page_url):
        """Initialise the base finder for ``page_url``.

        :param page_url: URL of the page to scan.
        """
        # 'img' and 'src' are presumably the tag and attribute names that
        # UrlFinder matches on -- confirm against UrlFinder.__init__.
        super().__init__(page_url, 'img', 'src')
1.67 KB
Binary file not shown.
472 Bytes
Binary file not shown.

Image/Finder.py

Lines changed: 0 additions & 62 deletions
This file was deleted.
-2.24 KB
Binary file not shown.
475 Bytes
Binary file not shown.
-6 Bytes
Binary file not shown.

functions.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import urllib.parse
33
import urllib.request
44

5-
from Image import Finder
5+
from Crawler import ImgFinder
66

77

88
def gather_img_src(page_url) -> object:
@@ -13,7 +13,7 @@ def gather_img_src(page_url) -> object:
1313
"""
1414
try:
1515
html = html_string(page_url)
16-
finder = Finder.ImgFinder(page_url)
16+
finder = ImgFinder.ImgFinder(page_url)
1717
finder.feed(html)
1818
except Exception as e:
1919
print(str(e))

test.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,12 @@
22
from Models.Complete.Image import Image
33
import os.path;
44
from Crawler.Page import Page
5+
from Crawler.Image import Image
6+
57
if __name__ == '__main__':
8+
"""
69
page=Page('https://gopostie.com')
7-
page.save()
8-
page.save_links()
10+
page.save_links()
11+
"""
12+
img = Image('https://gopostie.com')
13+
img.save_links()

0 commit comments

Comments
 (0)