Skip to content

Commit

Permalink
UPDATE: #11 - Retry parse when exception raised (MAX Attempt: 3)
Browse files Browse the repository at this point in the history
  • Loading branch information
leegeunhyeok committed Feb 5, 2020
1 parent 4bfbf5f commit b19220b
Showing 1 changed file with 75 additions and 61 deletions.
136 changes: 75 additions & 61 deletions src/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@
from src.util import extract_date, to_valid_filename, update_size, clean_text

class Parser:
# 오류 발생 시 재시도할 횟수
__ATTEMPT__ = 3

def __init__(self, chromedriver, cookie, logger, wait, delay, \
headless, options):
self._chromedriver = chromedriver
Expand All @@ -45,10 +48,10 @@ def parse(self, content_list, image_list, feeder_running, parser_running):
name = current_process().name
self._logger.info(name, 'ํฌ๋กฌ ๋“œ๋ผ์ด๋ฒ„ ๋กœ๋”ฉ ์ค‘..')

if self._headless:
parser_driver = webdriver.Chrome(self._chromedriver, chrome_options=self._options)
else:
parser_driver = webdriver.Chrome(self._chromedriver)
parser_driver = webdriver.Chrome(
self._chromedriver,
chrome_options=self._options
)

parser_driver.implicitly_wait(self._wait)
parser_driver.get('https://cyworld.com')
Expand All @@ -58,63 +61,74 @@ def parse(self, content_list, image_list, feeder_running, parser_running):
self._logger.info(name, 'ํฌ๋กฌ ๋“œ๋ผ์ด๋ฒ„ ๋กœ๋”ฉ ์™„๋ฃŒ')

while feeder_running.value or len(content_list) != 0:
try:
if len(content_list) != 0:
# 공유 리스트에서 게시물 URL 추출 및 접속
target_url = content_list.pop(0)
self._logger.info(name, target_url)
parser_driver.get(target_url)

# 필요한 데이터 추출
date = parser_driver \
.find_element_by_css_selector('div.view1 p')
images = parser_driver \
.find_elements_by_css_selector('section.imageBox')
texts = parser_driver \
.find_elements_by_css_selector('section.textBox')

# 원본 제목
title = parser_driver \
.find_element_by_id('cyco-post-title') \
.get_attribute('innerText')

# ํŒŒ์ผ ์ €์žฅ์„ ์œ„ํ•ด ์ „์ฒ˜๋ฆฌํ•œ ์ œ๋ชฉ (ํŒŒ์ผ๋ช…์œผ๋กœ ์‚ฌ์šฉ๋จ)
preprocessed_title = to_valid_filename(title)

# ๊ฒŒ์‹œ๊ธ€ ๋‚ ์งœ ์—…๋กœ๋“œ ๋‚ ์งœ
post_date = extract_date(date.get_attribute('innerText'))

# ๊ฒŒ์‹œ๊ธ€ ๋ฐ์ดํ„ฐ ๋ณ‘ํ•ฉ
post_text = '[ {} ]\n\n'.format(title)
for text in texts:
current_text = text.get_attribute('innerText').strip()

if len(current_text):
post_text += clean_text(current_text) + '\n'

# 이미지 목록 추출
for image in images:
imgs = image.find_elements_by_tag_name('img')

for img in imgs:
src = update_size(img.get_attribute('src'))

image_list.append({
'title': preprocessed_title,
'date': post_date,
'content': post_text,
'src': src
})

self._logger.info(
name, '{}_{} ํฌ์ŠคํŠธ ํŒŒ์‹ฑ ๋จ'.format(
post_date, title)
)

# ์‹ธ์ด์›”๋“œ ์„œ๋ฒ„ ๋ถ€ํ•˜ ๋ฐฉ์ง€๋ฅผ ์œ„ํ•ด ์ž ์‹œ ๋Œ€๊ธฐ
time.sleep(1)
except Exception as e:
self._logger.error(str(e))
attempt = 0

while attempt < Parser.__ATTEMPT__:
attempt += 1

try:
if len(content_list) != 0:
# 공유 리스트에서 게시물 URL 추출 및 접속
target_url = content_list.pop(0)
self._logger.info(name, target_url)
parser_driver.get(target_url)

# 필요한 데이터 추출
date = parser_driver \
.find_element_by_css_selector('div.view1 p')
images = parser_driver \
.find_elements_by_css_selector('section.imageBox')
texts = parser_driver \
.find_elements_by_css_selector('section.textBox')

# 원본 제목
title = parser_driver \
.find_element_by_id('cyco-post-title') \
.get_attribute('innerText')

# ํŒŒ์ผ ์ €์žฅ์„ ์œ„ํ•ด ์ „์ฒ˜๋ฆฌํ•œ ์ œ๋ชฉ (ํŒŒ์ผ๋ช…์œผ๋กœ ์‚ฌ์šฉ๋จ)
preprocessed_title = to_valid_filename(title)

# ๊ฒŒ์‹œ๊ธ€ ๋‚ ์งœ ์—…๋กœ๋“œ ๋‚ ์งœ
post_date = extract_date(date.get_attribute('innerText'))

# ๊ฒŒ์‹œ๊ธ€ ๋ฐ์ดํ„ฐ ๋ณ‘ํ•ฉ
post_text = '[ {} ]\n\n'.format(title)
for text in texts:
current_text = text.get_attribute('innerText') \
.strip()

if len(current_text):
post_text += clean_text(current_text) + '\n'

# 이미지 목록 추출
for image in images:
imgs = image.find_elements_by_tag_name('img')

for img in imgs:
src = update_size(img.get_attribute('src'))

image_list.append({
'title': preprocessed_title,
'date': post_date,
'content': post_text,
'src': src
})

self._logger.info(
name, '{}_{} ํฌ์ŠคํŠธ ํŒŒ์‹ฑ ๋จ'.format(
post_date, title)
)

# ์‹ธ์ด์›”๋“œ ์„œ๋ฒ„ ๋ถ€ํ•˜ ๋ฐฉ์ง€๋ฅผ ์œ„ํ•ด ์ž ์‹œ ๋Œ€๊ธฐ
time.sleep(1)
break
except IndexError:
break
except Exception as e:
time.sleep(3)
self._logger.error(str(e) + ' - Attempt({}/{})' \
.format(attempt, Parser.__ATTEMPT__))

parser_running.value = 0
parser_driver.close()
Expand Down

0 comments on commit b19220b

Please sign in to comment.