-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
130 lines (104 loc) · 5.13 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#-*- coding: utf-8 -*-
from openpyxl import Workbook
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re, os
import multiprocessing
# Directory that contains this script; the chromedriver binary is looked up
# relative to it.
BASE_DIR = os.path.split(os.path.abspath(__file__))[0]
path = "".join((BASE_DIR, "/down/chromedriver.exe"))

# Runs of whitespace (collapsed to a single space when cleaning names).
blankRemovePattern = re.compile(r'\s+')
# Shortest "(...)" span, i.e. the argument list of a javascript call in an href.
mapExtractPattern = re.compile(r'\((.*?)\)')
# Shortest single-quoted span, used to pull a site URL out of an href.
siteExtractPattern = re.compile(r'\'(.*?)\'')

# Institution id -> URL of the result page the institution was listed on.
ogNameDic = dict()
def _extractCallArgs(pattern, href):
    """Return the comma-separated pieces that ``pattern`` captures in ``href``.

    ``pattern`` must define one capture group (as ``mapExtractPattern`` does).
    The captured text is split on ',' and every piece is stripped of
    surrounding whitespace; quotes around a piece are kept.

    Raises:
        AttributeError: if ``pattern`` does not match ``href``.
    """
    return [piece.strip() for piece in pattern.search(href).group(1).split(',')]


def getList(indexId):
    """Scrape one page of the combined search results into the worksheet.

    Loads result page ``indexId`` with the module-level ``driver``, prints
    every extracted field, writes the fields to the module-level ``sheet1``
    and records each institution whose type code is ``'01'`` in ``ogNameDic``
    (institution id -> URL of this result page).

    Worksheet columns written per result row:
        1 institution type, 2 institution id, 3 institution name,
        4 institution type code, 5 establishment type, 6 address,
        7 latitude, 8 longitude, 9 site address.

    Args:
        indexId: 1-based page number sent as the ``pageIndex`` query parameter.
    """
    urlPath = "http://e-childschoolinfo.moe.go.kr/kinderMt/combineFind.do?pageIndex=%d" % indexId
    driver.get(urlPath)
    driver.implicitly_wait(10)

    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Result summary; skipped when the element is absent so a page without it
    # does not crash the run.
    resultCount = soup.select_one('#noticeSearch > div.content > p.tblResult > span')
    if resultCount is not None:
        print(resultCount.text.strip())

    lineString = '================================================================='
    resultTables = soup.select('#noticeSearch > div > table > tbody > tr')
    for trIdx, tr in enumerate(resultTables):
        # Excel row for this result.  openpyxl rows are 1-based, hence the
        # "+ 1": without it the first result of page 1 would target row 0.
        # NOTE(review): the factor 10 assumes 10 results per page - confirm.
        rowIndex = (indexId - 1) * 10 + trIdx + 1
        print(lineString)

        for tdIdx, td in enumerate(tr.select('td')):
            if tdIdx == 1:
                print('기관 유형 : %s' % td.text.strip())
                sheet1.cell(row=rowIndex, column=1).value = td.text.strip()
            elif tdIdx == 2:
                link = td.select_one('a')
                callArgs = _extractCallArgs(mapExtractPattern, link['href'])
                # Institution id (surrounding quote characters dropped).
                ogId = callArgs[1][1:-1]
                # Institution type code: 01 - kindergarten, 02 - daycare centre.
                ogType = callArgs[2][1:-1]
                # Institution name with whitespace runs collapsed.
                ogName = blankRemovePattern.sub(" ", link.text.strip())
                print('기관명 : %s' % ogName)
                print('기관코드 : %s' % ogId)
                print('기관유형 : %s' % ogType)
                sheet1.cell(row=rowIndex, column=2).value = ogId
                sheet1.cell(row=rowIndex, column=3).value = ogName
                sheet1.cell(row=rowIndex, column=4).value = ogType
                # Only kindergartens are queued for the detail step.
                if ogType == '01':
                    ogNameDic[ogId] = urlPath
            elif tdIdx == 3:
                print('설립 유형 : %s' % td.text.strip())
                sheet1.cell(row=rowIndex, column=5).value = td.text.strip()
            elif tdIdx == 4:
                print('기관 주소 : %s' % td.text.strip())
                sheet1.cell(row=rowIndex, column=6).value = td.text.strip()
            elif tdIdx == 5:
                # Second argument is the latitude, third the longitude.
                link = td.select_one('a')
                callArgs = _extractCallArgs(mapExtractPattern, link['href'])
                print('Latitude : %s, Longitude : %s' % (callArgs[1], callArgs[2]))
                sheet1.cell(row=rowIndex, column=7).value = callArgs[1]
                sheet1.cell(row=rowIndex, column=8).value = callArgs[2]
            elif tdIdx == 6:
                # The site link is optional; leave the cell empty without one.
                link = td.select_one('a')
                if link is None:
                    continue
                siteUrl = siteExtractPattern.search(link['href']).group(1)
                print('사이트 주소 : %s' % siteUrl)
                sheet1.cell(row=rowIndex, column=9).value = siteUrl

    # Closing separator after the last row.
    print(lineString)
def getDetailInfo(key, urlPath):
    """Open a result page and click the link that belongs to one institution.

    Uses the module-level ``driver``.

    Args:
        key: Text that must occur in the target link's ``href`` attribute
            (callers pass an institution id).
        urlPath: URL of the result page that contains the link.

    Raises:
        selenium.common.exceptions.NoSuchElementException: if no element on
            the page has an ``href`` containing ``key``.
    """
    # Imported here so the block is self-contained; ``find_element`` with a
    # ``By`` locator works on Selenium 3 and 4, whereas the
    # ``find_element_by_css_selector`` shortcut was removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    driver.get(urlPath)
    driver.implicitly_wait(10)
    elem = driver.find_element(By.CSS_SELECTOR, "[href*='%s']" % key)
    elem.click()
if __name__ == "__main__":
    # Headless Chrome configuration.
    options = webdriver.ChromeOptions()
    options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    options.add_argument("disable-gpu")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36")
    # Alternatively: options.add_argument("--disable-gpu")

    # ``driver``, ``wb`` and ``sheet1`` stay module-level names on purpose:
    # getList and getDetailInfo read ``driver`` / ``sheet1`` as globals.
    # ``options=`` replaces the deprecated ``chrome_options=`` keyword.
    driver = webdriver.Chrome(path, options=options)
    try:
        wb = Workbook()
        sheet1 = wb.active
        fileName = '20181211_APP_BACK_DATA.xlsx'
        sheet1.title = '유치원어린이집목록'

        # Scrape the list pages (currently only page 1) and persist them
        # before the detail step starts.
        for idx in range(1, 2):
            getList(idx)
        wb.save(filename=fileName)

        for key, urlPath in ogNameDic.items():
            getDetailInfo(key, urlPath)
    finally:
        # quit() ends the whole browser session and the chromedriver process,
        # even when scraping failed; close() would only close the window.
        driver.quit()

    print(ogNameDic)
    print("MultiProcessing CPU Count : %d" % multiprocessing.cpu_count())