-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscrap_jornal_sc.py
101 lines (88 loc) · 3.84 KB
/
scrap_jornal_sc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
'''
Created on Oct 22, 2018
@authors: Vinicius Freitas
'''
import scrapy
import psycopg2
import db_settings
from scrapy.http import Request
from datetime import datetime
class JORNAL_SC(scrapy.Spider):
    """Spider for the 'Jornal de Santa Catarina' latest-news listing.

    Crawls the paginated listing at ``url_base`` (up to 100 pages), follows
    each article link, and inserts the parsed article into a PostgreSQL
    ``news`` table. Connection settings come from the project-local
    ``db_settings`` module.
    """

    dbname = db_settings.DBNAME
    dbhost = db_settings.DBHOST
    dbuser = db_settings.DBUSER
    dbpass = db_settings.DBPASS
    name = 'JORNAL DE SANTA CATARINA'
    start_urls = ['http://jornaldesantacatarina.clicrbs.com.br/sc/ultimas-noticias/']
    url_base = 'http://jornaldesantacatarina.clicrbs.com.br/sc/ultimas-noticias/'
    debug = True

    def __init__(self, *args, **kwargs):
        # Accept and forward scrapy's spider kwargs (backward compatible:
        # calling with no arguments still works as before).
        super().__init__(*args, **kwargs)
        print("init")
        # Keyword arguments instead of a hand-built DSN string: previously a
        # credential containing a quote would break (or inject into) the DSN.
        self.conn = psycopg2.connect(dbname=self.dbname,
                                     user=self.dbuser,
                                     host=self.dbhost,
                                     password=self.dbpass)

    @staticmethod
    def _extract_header(news):
        """Extract (subject, publication-date text, tag list) from one
        listing entry. Any of the three may be None/empty if the markup
        deviates from the expected layout."""
        header = news.xpath("p[@class='materia-cabecalho']")
        subject = header.xpath(
            "span[contains(@class, 'editoria')]//text()").extract_first()
        date_txt = header.xpath(
            "span[@class='data-publicacao']//text()").extract_first()
        tags = news.css("div .lista-tags").css("li").xpath("a/@title").extract()
        return subject, date_txt, tags

    def parse(self, response):
        """Parse one listing page: yield one Request per article (payload in
        meta for :meth:`parse_news`) plus a Request for the next page.

        ``response.meta['page']`` is the 1-based page number of THIS page;
        crawling stops after page 100.
        """
        pg = response.meta.get('page', 1)
        if pg > 100:
            return
        for news in response.xpath("//div[@class='conteudo-lista']"):
            sub, date_txt, tags = self._extract_header(news)
            title_link_html = news.css("h2")
            title = title_link_html.xpath("a/@title").extract_first()
            next_link = title_link_html.xpath("a/@href").extract_first()
            # Skip malformed entries instead of crashing the whole callback
            # (strptime(None) / Request(None) would raise and abort the page).
            if date_txt is None or next_link is None:
                continue
            date_time = datetime.strptime(date_txt, "%d/%m/%Y | %Hh%M")
            print("\n\n***\n\n")
            print(title)
            print(sub)
            print(date_txt)
            print(next_link)
            print(tags)
            req = Request(next_link, callback=self.parse_news)
            req.meta['title'] = title
            req.meta['subject'] = sub
            req.meta['date'] = date_time
            req.meta['tags'] = tags
            yield req
        # Request page pg+1 (the original requested "?pagina=<pg>", which
        # re-scraped the current page — off-by-one).
        next_page = self.url_base + "?pagina=" + str(pg + 1)
        print("Crawl to: " + next_page)
        req = Request(next_page, callback=self.parse)
        req.meta['page'] = pg + 1
        yield req

    def parse_news(self, response):
        """Parse one article page and insert it into the ``news`` table.

        Metadata scraped from the listing page (title, subject, tags,
        publication datetime) arrives via ``response.meta``.
        """
        title = response.meta['title']
        subtitle = ''
        subject = response.meta['subject']
        tags = response.meta['tags']
        date_time = response.meta['date']
        # May be None when the byline div is absent; parameterized insert
        # below stores NULL in that case (the old string-concatenated query
        # raised TypeError instead).
        author = response.xpath(
            "//div[@class='materia-assinatura']//text()").extract_first()
        text = " ".join(response.xpath(
            "//div[contains(@class, 'materia-corpo')]//p//text()").extract())
        if text:
            text += " "  # preserve the original trailing separator
        link = response.url
        portal = 'Jornal de SC'
        # Parameterized query instead of $tag$-quoted string concatenation:
        # article text containing "$tag$" (or any quoting) previously broke
        # the statement — a textbook SQL-injection vector.
        query = ("insert into news (title, subtitle, date_time, text, "
                 "authors, portal, tags, subject, link) "
                 "values (%s, %s, %s, %s, %s, %s, %s, %s, %s)")
        params = (title, subtitle, str(date_time), text, author, portal,
                  str(tags).replace("'", ''), subject, link)
        cur = self.conn.cursor()
        try:
            cur.execute(query, params)
            self.conn.commit()
        except Exception as e:
            print("\n\n\nQuery Error: " + str(e) + "\n\n\n\n")
            self.conn.rollback()
        finally:
            cur.close()  # previously leaked until GC