main.py
import bot
import pandas as pd
from enum import Enum
from time import sleep
from modules import latimes
from selenium.webdriver.common.by import By
from robocorp.tasks import task
from RPA.Robocorp.WorkItems import WorkItems


class Locators(str, Enum):
    """Main element locators used on the website.

    The str mixin lets members be passed directly wherever a plain locator
    string is expected, while `.value` keeps working as well.
    """
    SEARCH_BTN = "//*[@data-element='search-button']"
    SEARCH_INPUT = "//*[@data-element='search-form-input']"
    SUBMIT_SEARCH = "//*[@data-element='search-submit-button']"
    SORT_BY_SELECT = "select.select-input"
    SEARCH_FILTER_TOPIC = "ul.search-filter-menu"
    UL_NEWS = "ul.search-results-module-results-menu"

@task
def main():
    """Main flow.

    Performs the steps needed to automate the collection of news from the LA Times site.
    """
    # Get the work item variables
    wi = WorkItems()
    wi.get_input_work_item()
    input_wi = wi.get_work_item_variables()
    search_phrase = input_wi.get("search_phrase", "Economic")
    topic_search = input_wi.get("topic_search", "Business")

    # Start the browser and open the website
    browser = bot.browser.Edge()
    browser.searchBrowser("https://www.latimes.com")
    page_title = browser.title
    bot.logger.info(f"Current page title: '{page_title}'")

    # Click the search button
    element = browser.find_element("xpath", Locators.SEARCH_BTN)
    assert element, f"Element '{Locators.SEARCH_BTN.name}' not found"
    element.click()

    # Focus on the search field and type the search phrase
    element = browser.find_element("xpath", Locators.SEARCH_INPUT)
    assert element, f"Element '{Locators.SEARCH_INPUT.name}' not found"
    element.send_keys(search_phrase)

    # Click the submit search button
    element = browser.find_element("xpath", Locators.SUBMIT_SEARCH)
    assert element, f"Element '{Locators.SUBMIT_SEARCH.name}' not found"
    element.click()

    # Wait for the redirect to the search results page
    bot.utils.wait_condition(
        condition=lambda: browser.title != page_title,
        timeout=15,
        error=TimeoutError
    )

    # URL of the search results page
    search_page_url = browser.url

    # Filter by topic
    element = browser.find_element("css selector", Locators.SEARCH_FILTER_TOPIC)
    assert element, f"Element '{Locators.SEARCH_FILTER_TOPIC.name}' not found"
    topics = element.find_elements(By.TAG_NAME, "li")
    filtered = False
    for item in topics:
        topic = item.find_element(By.CSS_SELECTOR, "label span").text
        if topic_search in topic:
            checkbox = item.find_element(By.CSS_SELECTOR, "input[type='checkbox']")
            if not checkbox.is_selected():
                checkbox.click()
            filtered = True
            break
    if not filtered:
        bot.logger.info(f"No topic matches the term '{topic_search}'")

    # Set the 'Sort By' field to display the newest news first
    element = browser.awaits_presence_element("css selector", Locators.SORT_BY_SELECT)
    assert element, f"Element '{Locators.SORT_BY_SELECT.name}' not found"
    browser.select_option(element, "Newest")
    sleep(2)
    bot.utils.wait_condition(
        condition=lambda: browser.url != search_page_url,
        timeout=10,
        error=TimeoutError
    )

    # Get the list of news results
    element = browser.awaits_presence_element("css selector", Locators.UL_NEWS)
    assert element, f"Element '{Locators.UL_NEWS.name}' not found"
    sleep(2)

    # List to collect the news data
    news_data = []
    news = element.find_elements(By.TAG_NAME, "li")

    # Get title, date, description and picture filename for each news item
    for item in news:
        # Initialize values as None
        title = None
        date = None
        description = None
        picture_url = None

        try:
            title = item.find_element(By.CSS_SELECTOR, "h3.promo-title").text
        except Exception as error:
            bot.logger.error(f"Error while obtaining the news title: {error}")

        try:
            date = item.find_element(By.CSS_SELECTOR, "p.promo-timestamp").get_attribute("data-timestamp")
        except Exception as error:
            bot.logger.error(f"Error while obtaining the news date: {error}")

        try:
            description = item.find_element(By.CSS_SELECTOR, "p.promo-description").text
        except Exception as error:
            bot.logger.error(f"Error while obtaining the news description: {error}")

        try:
            picture_url = item.find_element(By.CSS_SELECTOR, "img.image").get_attribute("src")
            picture_filename = bot.utils.filename_from_url(picture_url, ".jpg")
        except Exception:
            bot.logger.error(f"Unable to download the image for '{title}'. It probably doesn't have one")
            picture_filename = "No image"

        news_data.append({
            "Title": title,
            "Date": bot.utils.timestamp_to_date(float(date)) if date else None,
            "Description": description,
            "Picture filename": picture_filename,
            "Count search phrase": bot.utils.count_phrase_in_context(search_phrase, [title, description]),
            "Contains Amount": latimes.amount_checker([title, description])
        })

        # Download the picture only when one was found for this news item
        if picture_url:
            browser.download_file(picture_url, alt_extension=".jpg")

    # Save the collected data to an Excel file
    df = pd.DataFrame(news_data)
    df.to_excel("./output/news.xlsx", index=False)


if __name__ == "__main__":
    try:
        main()
    except Exception as error:
        bot.logger.error(f"Unexpected error in the flow: {error}")