-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathiphin2.py
92 lines (71 loc) · 1.8 KB
/
iphin2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import gc
import operator
import re
from collections import Counter
from urllib.parse import urljoin, urlparse

import bs4
import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas import read_csv
# Scrape one Times of India archive page, follow every link ending in ".cms",
# and report how often each word from the phrase CSV occurs in each linked page.

ARCHIVE_URL = 'https://timesofindia.indiatimes.com/2018/10/20/archivelist/year-2018,month-10,starttime-43393.cms'
PHRASE_CSV = r"ph_ftrs51dic.csv"
# Compiled once; raw string so "\w" is a regex class, not a string escape.
WORD_RE = re.compile(r'\w+')


def load_phrase_words(csv_path):
    """Return the values of the 'english' column of *csv_path* as a list.

    Only the first CSV column is read, and it must be named 'english'.
    """
    ph_data = pd.read_csv(csv_path, usecols=[0])
    return ph_data['english'].tolist()


def resolve_link(base_url, href):
    """Return *href* as an absolute URL.

    An href that already carries a scheme and a host is returned unchanged;
    anything else is resolved against *base_url* (the page the link was
    found on).
    """
    parsed = urlparse(href)
    if parsed.scheme and parsed.netloc:
        return href
    return urljoin(base_url, href)


def extract_words(text, stopwords):
    """Return the lowercased ``\\w+`` tokens of *text* that are not stopwords.

    Tokens are lowercased before the stopword test so that capitalised
    stopwords ("The", "And") are filtered as well.
    """
    lowered = (token.lower() for token in WORD_RE.findall(text))
    return [token for token in lowered if token not in stopwords]


def phrase_frequencies(words, phrase_words):
    """Count occurrences of each phrase word in *words*.

    Returns a flat list alternating ``[word]`` and ``[count]`` entries, in
    *phrase_words* order, containing only words that occur at least once.
    """
    counts = Counter(words)  # one pass instead of one scan per phrase word
    word_freq = []
    for phrase in phrase_words:
        n = counts.get(phrase, 0)
        if n > 0:
            word_freq.append([phrase])
            word_freq.append([n])
    return word_freq


def main():
    """Fetch the archive page and print phrase-word frequencies per link."""
    # Loop-invariant inputs: load them once rather than once per link.
    phrase_words = load_phrase_words(PHRASE_CSV)
    stopwords = frozenset(nltk.corpus.stopwords.words('english'))

    archive = requests.get(ARCHIVE_URL)
    archive_soup = bs4.BeautifulSoup(archive.text, "lxml")

    for link in archive_soup.select("a[href$='.cms']"):
        # Relative hrefs are resolved against the archive page itself.
        # Appending them to the previously visited article URL yields a
        # malformed address, and fails outright when the very first link
        # on the page is relative.
        article_url = resolve_link(ARCHIVE_URL, link.get('href'))
        article = requests.get(article_url)
        text = BeautifulSoup(article.text, "html5lib").get_text()
        words = extract_words(text, stopwords)
        print(article_url, ' frequency= ', phrase_frequencies(words, phrase_words))


if __name__ == "__main__":
    main()