-
Notifications
You must be signed in to change notification settings - Fork 0
/
corpus_crawler.py
79 lines (34 loc) · 1.13 KB
/
corpus_crawler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# -*- coding: utf-8 -*-
"""
-------------------------------------------------
File Name: corpus_crawler
Description : Download an article page and write the text of its main content div to speech_corpus.txt
date: 2018/12/19
-------------------------------------------------
Change Activity:
2018/12/19:
-------------------------------------------------
"""
import lxml
import requests
from bs4 import BeautifulSoup
def get_head(path='../spider/headers.txt', index=3):
    """Load the stored request headers and return one entry.

    The file at *path* must contain a single Python literal (presumably a
    list of header dicts -- verify against the actual headers file) which is
    then indexed with *index*.

    Args:
        path: Location of the headers file.
        index: Which entry of the parsed literal to return. Defaults to 3,
            the entry this module has always used.

    Returns:
        The entry found at ``index`` in the parsed file contents.

    Raises:
        OSError: If the file cannot be opened.
        ValueError, SyntaxError: If the file does not contain a plain
            Python literal.
        IndexError, KeyError: If the parsed value has no entry at ``index``.
    """
    # Local import keeps this block self-contained; ``ast`` is stdlib.
    import ast

    with open(path, 'r') as f:
        raw = f.read()

    # SECURITY: this used to be ``eval(raw)``, which executes arbitrary code
    # placed in the headers file. ``ast.literal_eval`` accepts only literals
    # (lists, dicts, strings, numbers, ...) and rejects everything else.
    headers = ast.literal_eval(raw)
    return headers[index]
def get_tag(url, timeout=10):
    """Download *url* and return its main article container.

    The page is requested with the headers returned by ``get_head()`` and
    parsed with the ``lxml`` parser. The container is the first ``<div>``
    that has ``id="main_content"`` and the CSS class ``js_selection_area``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The first matching BeautifulSoup tag.

    Raises:
        requests.RequestException: On network failure, timeout, or an HTTP
            error status.
        IndexError: If the page contains no matching ``<div>``.
    """
    response = requests.get(url=url, headers=get_head(), timeout=timeout)
    # Fail early on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    matches = soup.find_all(name='div',
                            class_='js_selection_area',
                            id='main_content')
    if not matches:
        # Same exception type the bare ``[0]`` used to raise, but with
        # enough context to diagnose a changed page layout.
        raise IndexError(
            "no <div id='main_content' class='js_selection_area'> "
            "found at %s" % url)
    return matches[0]
def save_txt(url, path='speech_corpus.txt'):
    """Write the text of an article's main container to a file.

    The container returned by ``get_tag(url)`` is walked one direct child at
    a time; each child's ``.string`` is written on its own line. Children
    whose ``.string`` is ``None`` or exactly ``'\\n'`` are skipped.

    NOTE(review): in BeautifulSoup an element with more than one child has
    ``.string`` of ``None``, so paragraphs containing nested markup appear to
    be skipped here -- confirm this is intended.

    Args:
        url: Address of the article page to scrape.
        path: Output file; it is overwritten. Defaults to
            ``'speech_corpus.txt'`` in the current working directory.

    Returns:
        None.
    """
    tag = get_tag(url)
    with open(path, 'w', encoding='utf-8') as f:
        for child in tag.children:
            text = child.string  # read the property once per child
            if text is None or text == '\n':
                continue
            f.write(text + '\n')
if __name__ == '__main__':
    # Script entry point: scrape one ifeng finance article and store its
    # text via save_txt().
    save_txt(url="http://finance.ifeng.com/a/20181218/16627248_0.shtml")