nepalScrap.py
# for scraping
from bs4 import BeautifulSoup
import requests
from fake_useragent import UserAgent
# date parsing
from dateutil import parser as date_parser
from datetime import datetime
# MongoDB
import pymongo
def clean_description(description):
    """Strip HTML from an RSS description, preferring the first <p> tag's text."""
    soup_description = BeautifulSoup(description, 'html.parser')
    if soup_description.find('p') is None:
        return soup_description.text
    return soup_description.find('p').text
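# Illustrative usage (assumed input, not from the original script):
#   clean_description("<p>Kathmandu saw heavy rain.</p>")  ->  "Kathmandu saw heavy rain."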
def NepalNewsScrap():
    myclient = pymongo.MongoClient("mongodb://localhost:27017/")
    breakfastDB = myclient["breakfast"]
    breakfastDB["allNews"].drop()  # start from an empty collection on every run
    newsCollection = breakfastDB["allNews"]
    # Keys must be unique: the original dict repeated "the himalayan times",
    # so the Kathmandu feed was silently overwritten by the Nepal feed.
    urls = {
        "the himalayan times (kathmandu)": "https://thehimalayantimes.com/category/kathmandu/feed/",
        "the himalayan times (nepal)": "https://thehimalayantimes.com/category/nepal/feed/",
        "OnlineKhabar": "https://english.onlinekhabar.com/category/political/feeds",
    }
    user_agent = UserAgent()
    header = {"user-agent": user_agent.random}  # randomised User-Agent header
    for publisher in urls:
        try:
            response = requests.get(urls[publisher], headers=header)
            soup = BeautifulSoup(response.content, 'xml')
            item_list = soup.find_all('item')
            for post in item_list:
                aDic = {}
                title = post.find('title').text  # article title
                pubdate = post.find('pubDate').text  # publication date
                link = post.find('link').text  # link to the article
                description = post.find('description').text
                aDic['scrapTime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                aDic['Publisher'] = publisher
                aDic['Title'] = title
                aDic['Publish_Date'] = date_parser.parse(pubdate)
                aDic['Link'] = link
                aDic['Description'] = clean_description(description)
                newsCollection.insert_one(aDic)
        except Exception as e:
            print(e)
    print("-----Success------------------")


if __name__ == "__main__":
    NepalNewsScrap()
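
# --- Optional read-back sketch (not part of the original script) ---
# A minimal example of querying the documents written above, e.g. from a REPL;
# it assumes a local MongoDB instance whose "breakfast" database has been
# populated by NepalNewsScrap(). The function name is purely illustrative.
def print_latest_news(limit=5):
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    collection = client["breakfast"]["allNews"]
    # Newest articles first, sorted by the parsed publication date.
    for doc in collection.find().sort("Publish_Date", pymongo.DESCENDING).limit(limit):
        print(f"{doc['Publisher']}: {doc['Title']} ({doc['Publish_Date']})")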