multi_crawl.py
# -*- coding: utf-8 -*-
#+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
#|r|e|d|a|n|d|g|r|e|e|n|.|c|o|.|u|k|
#+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
import os
from urllib.parse import urlparse

import tldextract
from scrapy import Request
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
class LkSpider(CrawlSpider):
    name = 'multi_spider'

    # read start URLs from a CSV with one URL per line
    with open('urls.csv') as file:
        start_urls = [line.strip() for line in file if line.strip()]

    # follow every link on every page and hand each response to parse_item
    rules = (
        Rule(LinkExtractor(), callback='parse_item', follow=True),
    )

    def start_requests(self):
        # yield one request per start URL; CrawlSpider's parse() applies the rules
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse)

    def parse_item(self, response):
        # registered domain (e.g. "example" from "www.example.co.uk") for the file name
        domain = tldextract.extract(response.request.url).domain
        # URL path with the slashes removed (not used in the file name below)
        path = urlparse(response.request.url).path.replace("/", "")

        # join all paragraph text and cap it at 1200 characters
        descriptions = response.xpath('//p/text()').getall()
        description = ''.join(descriptions)[:1200]

        # name the file after the last path segment before the trailing slash
        filename = response.url.split("/")[-2] + '.txt'
        os.makedirs("output", exist_ok=True)
        with open("output/" + domain + "_" + filename, 'w') as f:
            f.write(description)


# main driver
if __name__ == "__main__":
    process = CrawlerProcess()
    process.crawl(LkSpider)
    process.start()
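
For reference, a minimal sketch of the inputs the spider expects: a urls.csv file next to the script with one URL per line (the addresses below are placeholders, not from the original repository), and an output/ directory for the scraped text, which the spider creates if it is missing. Running the file directly starts the crawl via CrawlerProcess.

    urls.csv:
        https://example.com/
        https://example.org/

    run:
        python multi_crawl.py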