Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
cache
.idea
14 changes: 8 additions & 6 deletions hh_web_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import time

# достает html код по указанной ссылке
def get_html(url):
def get_html(url):
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
rq = requests.get(url, headers=headers)
print('Gettin HTML-code from ', url)
Expand All @@ -15,7 +15,7 @@ def get_html(url):
# проверяет, есть ли на странице ссылки на вакансии
def is_empty(html):
soup = BeautifulSoup(html, 'lxml')
links = soup.find_all('a', class_='search-result-item__name')
links = soup.find_all('a', class_='HH-LinkModifier')
if links == []:
return True
else:
Expand Down Expand Up @@ -57,7 +57,7 @@ def get_offers_links(html, all_links):
# новый объект класса BeautifulSoup
soup = BeautifulSoup(html, 'lxml')

links = soup.find_all('a', class_='search-result-item__name')
links = soup.find_all('a', class_='HH-LinkModifier')
for link in links:
link_parsed = link.get('href').split('?')
all_links.append(link_parsed[0])
Expand All @@ -82,7 +82,7 @@ def parse_skills_in_offer(soup, skill_dict):
# функция, которая парсит блок с описанием вакансии и возвращает дополненный словарь, который ей дали на входе
def parse_description_in_offer(soup, description_dict):
# описание вакансии
description = soup.find('div', class_='b-vacancy-desc-wrapper')
description = soup.find('div', class_="vacancy-section")
# оставим только текст без тегов
text = ''.join(description.findAll(text=True))
# почистим текст от знаков препинания
Expand Down Expand Up @@ -124,7 +124,9 @@ def get_and_save_area_codes():
def parse_offers(links):
skill_dict = {}
description_dict = {}
for link in links:
for i in range(len(links)):
link = links[i]
print('Скачано ', i, ' из ', len(links))
html = get_html(link)
time.sleep(.3)
soup = BeautifulSoup(html, 'lxml')
Expand All @@ -146,7 +148,7 @@ def parse_offers(links):

if __name__ == '__main__':
query = 'python'
area = '113'
area = '2'
# сначала вытащим все ссылки на вакансии по данному запросу и региону
links = get_all_offers_links(query, area)
# теперь распарсим информацию по каждой ссылке, полученной выше
Expand Down