-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.py
99 lines (95 loc) · 3.85 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import os
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
# --- Site setup -------------------------------------------------------------
# Opens the portal in Firefox, follows the "College-wise Allotment Details"
# link into its own window, reads the college dropdown, and makes sure the
# on-disk cache directory exists.

# Visible option texts of the college / branch <select> elements.
clg_dropdown_list_options = []
branch_dropdown_list_options = []

driver = webdriver.Firefox()
driver.get("https://tgeapcet.nic.in")

# Block until the browser reports the portal's default page as current URL.
while driver.current_url != "https://tgeapcet.nic.in/default.aspx":
    sleep(2)

# Click the allotment link, retrying on Selenium errors (link not found yet,
# not clickable yet, ...). Only WebDriverException is retried so that Ctrl-C
# and genuine programming errors still stop the script, and a short pause
# keeps the retry from spinning at full CPU.
while True:
    try:
        clg_allotment_hplink = driver.find_element(
            By.LINK_TEXT, "College-wise Allotment Details"
        )
        clg_allotment_hplink.click()
        break
    except WebDriverException:
        sleep(.5)

# The allotment page is addressed as the second window handle. Wait for that
# handle to exist instead of assuming it is there after a fixed delay, which
# would raise IndexError on a slow connection.
sleep(1)
while len(driver.window_handles) < 2:
    sleep(1)
driver.switch_to.window(driver.window_handles[1])

# Poll the college dropdown until it exposes at least one option.
while not clg_dropdown_list_options:
    sleep(2)
    try:
        clg_dropdown_list = driver.find_element(By.ID, "MainContent_DropDownList1")
        clg_dropdown_list_options = [
            option.text
            for option in clg_dropdown_list.find_elements(By.TAG_NAME, "option")
        ]
    except WebDriverException:
        # The <select> is missing or was replaced mid-read; poll again.
        clg_dropdown_list_options = []

print("Site Setup Finished, Starting now.")

# Per-(college, branch) CSV files are written here by the scraping loop.
os.makedirs("cache", exist_ok=True)
# --- Scrape every (college, branch) allotment table into cache/ -------------
# Colleges are visited in reverse dropdown order. Each scraped table is saved
# as cache/<college>::<branch>.csv; a pair whose file already exists is
# skipped, so an interrupted run can be resumed.
for college in reversed(clg_dropdown_list_options):
    # Select the college. Selenium errors (element missing, stale, option not
    # present yet) are treated as "not ready" and retried; anything else,
    # including Ctrl-C, propagates.
    while True:
        try:
            clg_dropdown_list = driver.find_element(By.ID, "MainContent_DropDownList1")
            Select(clg_dropdown_list).select_by_visible_text(college)
        except WebDriverException:
            sleep(.5)
        else:
            print("Selected College:", college)
            break

    # Poll the branch dropdown until it lists at least one option.
    # NOTE(review): the first read happens immediately after the selection, so
    # it may still see the previous college's branches if the page updates
    # asynchronously — confirm against the live site.
    branch_dropdown_list_options = []
    while not branch_dropdown_list_options:
        try:
            branch_dropdown_list = driver.find_element(By.ID, "MainContent_DropDownList2")
            branch_dropdown_list_options = [
                option.text
                for option in branch_dropdown_list.find_elements(By.TAG_NAME, "option")
            ]
        except WebDriverException:
            # The <select> is missing or was replaced mid-read; poll again.
            branch_dropdown_list_options = []
        sleep(1)

    for branch in branch_dropdown_list_options:
        if branch == "--Select Branch--":
            continue

        # Build the cache path once and use it for both the lookup and the
        # write. "/" would be interpreted as a directory separator, so it is
        # replaced in both name parts (previously only the written branch name
        # was sanitised, so such branches never produced a cache hit).
        safe_college = college.replace("/", "|")
        safe_branch = branch.replace("/", "|")
        cache_file = f"cache/{safe_college}::{safe_branch}.csv"
        if os.path.exists(cache_file):
            print("Cache Hit, Skipping ⚠️")
            continue

        # Select the branch; on a Selenium error re-select the college first
        # and then try again.
        while True:
            try:
                branch_dropdown_list = driver.find_element(By.ID, "MainContent_DropDownList2")
                Select(branch_dropdown_list).select_by_visible_text(branch)
            except WebDriverException:
                try:
                    clg_dropdown_list = driver.find_element(By.ID, "MainContent_DropDownList1")
                    Select(clg_dropdown_list).select_by_visible_text(college)
                    print("Reelected College to escape error:", college)
                except WebDriverException:
                    # The recovery step hit the same kind of error; fall
                    # through to the pause below and retry from the top.
                    pass
                sleep(.5)
            else:
                print("Processing College:", college, "| Branch:", branch)
                break

        show_allot = driver.find_element(By.ID, "MainContent_btn_allot")
        show_allot.click()
        sleep(2)  # fixed wait for the result page to render

        soup = BeautifulSoup(driver.page_source, "html.parser")
        tables = soup.find_all("table")
        if not tables:
            print("No Seats Allocated in College or Errored out")
            driver.find_element(By.XPATH, "/html/body/right/a/img").click()
            continue

        # The allotment data is taken from the last <table> on the page.
        # Rows without <td> cells (e.g. a <th>-only header row) come out as
        # empty lists and therefore as all-NaN rows in the frame.
        table = tables[-1]
        rows = [
            [cell.text.strip() for cell in row.find_all("td")]
            for row in table.find_all("tr")
        ]
        allotments = pd.DataFrame(rows)
        # Drop the first column (presumably a serial number — TODO confirm).
        del allotments[0]
        allotments.columns = [
            "Hall Ticket No",
            "Rank",
            "Name of the Candidate",
            "Sex",
            "Caste",
            "Region",
            "Seat Category",
        ]
        allotments["Branch"] = branch
        allotments["College"] = college
        allotments.to_csv(cache_file)
        del allotments
        print("Done Processing College:", college, "| Branch:", branch, "✨")
# --- Merge the cached CSVs into result.csv and clean up ---------------------
print("Done scraping, Building final result")
try:
    # Sorted so the row order of result.csv does not depend on the arbitrary
    # order in which the OS lists the directory.
    cache_files = ["cache/" + name for name in sorted(os.listdir("cache"))]

    # Read everything first and concatenate once; concatenating inside the
    # loop re-copies the accumulated frame for every file.
    frames = [pd.read_csv(path, index_col=0) for path in cache_files]
    # pd.concat raises ValueError on an empty list, so fall back to an empty
    # frame when nothing was cached.
    result = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
    del frames
    result.to_csv("result.csv")

    # Remove the cache only after result.csv has been written, so a failure
    # while merging does not destroy the scraped data.
    for path in cache_files:
        os.remove(path)
    os.rmdir("cache")
    print("Done")
finally:
    # Always close the browser, even when merging or cleanup fails.
    driver.quit()