From f2964d52f2332e89380eb216ef9e299ecd1aecf1 Mon Sep 17 00:00:00 2001
From: andreireporter13
Date: Sat, 28 Sep 2024 12:55:34 +0300
Subject: [PATCH] Ignore hack_api_nemlig.py; remove old Tesla and Sustainalytics scrapers and test_file.py

---
 .gitignore                      |   1 +
 old_sites/tesla_scraper.py      | 103 --------------
 sites/sustainalytics_scraper.py | 240 --------------------------------
 test_file.py                    |  29 ----
 4 files changed, 1 insertion(+), 372 deletions(-)
 delete mode 100644 old_sites/tesla_scraper.py
 delete mode 100644 sites/sustainalytics_scraper.py
 delete mode 100644 test_file.py

diff --git a/.gitignore b/.gitignore
index 327df5c..2782ba8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,4 @@
 /Bash_scrapers
 **/__pycache__
 sites/000_delete_api_data.py
+/hack_api_nemlig.py
diff --git a/old_sites/tesla_scraper.py b/old_sites/tesla_scraper.py
deleted file mode 100644
index 6893deb..0000000
--- a/old_sites/tesla_scraper.py
+++ /dev/null
@@ -1,103 +0,0 @@
-#
-#
-#
-# This is the scraper for Tesla!
-# Link to Tesla ---> https://www.tesla.com/ro_ro/careers
-#
-# https://www.tesla.com/ro_RO/careers/search/job
-# (the job link is this URL plus the title joined with '-', plus the job id)
-#
-# Example - Title - Open Tech Day Bucharest - 27th April 2023
-# ---------> link - https://www.tesla.com/ro_RO/careers/search/job/open-tech-day-bucharest-27th-april-2023-179440
-# ---------> link - https://www.tesla.com/cua-api/apps/careers/state
-#
-from A_OO_get_post_soup_update_dec import DEFAULT_HEADERS, update_peviitor_api
-from L_00_logo import update_logo
-import requests
-#
-import uuid
-
-
-def return_response_from_api(url: str, headers: dict):
-    """
-    Make a GET request to the Tesla API and return the JSON response.
-    """
-
-    response = requests.get(url=url, headers=headers).json()
-
-    return response
-
-
-def return_all_dict_data_jobs():
-    """
-    Return all job listings for Romania from the Tesla careers API.
-    """
-
-    data = return_response_from_api('https://www.tesla.com/cua-api/apps/careers/state', DEFAULT_HEADERS)
-
-    data_lst = []
-    ###
-    try:
-        all_l_nums = data['geo'][1]['sites'][22]['cities']
-        for dl in all_l_nums:
-            data_lst.extend(all_l_nums[dl])
-    except (KeyError, IndexError):  # Romania's index in 'sites' sometimes shifts by one
-        all_l_nums = data['geo'][1]['sites'][23]['cities']
-        for dl in all_l_nums:
-            data_lst.extend(all_l_nums[dl])
-    ###
-
-    lst_with_dict_data = []
-    for listing in data['listings']:
-        cod = listing['l']
-
-        if cod in data_lst:
-            lst_with_dict_data.append(listing)
-
-    return lst_with_dict_data
-
-
-def return_links_with_jobs():
-    """
-    Build the list of job dicts (title + link) for every Romanian listing.
-    """
-    list_dict = return_all_dict_data_jobs()
-
-    list_for_pe_viitor = []
-    for ide in list_dict:
-        link_1 = ide['t'].lower().replace('- ', '').split()
-        link_2 = str(ide['id'])
-
-        title = ide['t']
-        link_final = 'https://www.tesla.com/ro_RO/careers/search/job/' + '-'.join(link_1) + '-' + link_2
-
-        list_for_pe_viitor.append({
-            "id": str(uuid.uuid4()),
-            "job_title": title,
-            "job_link": link_final,
-            "company": "Tesla",
-            "country": "Romania",
-            "city": "Romania"
-        })
-
-    return list_for_pe_viitor
-
-
-# update data on peviitor!
-@update_peviitor_api
-def scrape_and_update_peviitor(company_name, data_list):
-    """
-    Update data on the peviitor API.
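-    The 'update_peviitor_api' decorator performs the actual upload; this function only supplies 'data_list'.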
- """ - - return data_list - - -company_name = 'Tesla' -data_list = return_links_with_jobs() -scrape_and_update_peviitor(company_name, data_list) - -print(update_logo('Tesla', - 'https://autostickere.ro/image/cache/catalog/Stickere/Stickere%20Auto/Stickere%20Marci/Tesla/sticker-tesla-text-1000x1000.jpg' - )) diff --git a/sites/sustainalytics_scraper.py b/sites/sustainalytics_scraper.py deleted file mode 100644 index 1241178..0000000 --- a/sites/sustainalytics_scraper.py +++ /dev/null @@ -1,237 +0,0 @@ -# -# -# Config for Dynamic Post Method -> For Json format! -# -# Company ---> Sustainalytics -# Link ------> https://careers.morningstar.com/sustainalytics/us/en/home -# -# ------ IMPORTANT! ------ -# if you need return soup object: -# you cand import from __utils -> GetHtmlSoup -# if you need return regex object: -# you cand import from __utils -> -# ---> get_data_with_regex(expression: str, object: str) -# -# -from __utils import ( - PostRequestJson, - # - GetRequestJson, # for another locations need to make GetRequestJson - # - get_county, - get_job_type, - Item, - UpdateAPI, - # - GetHeadersDict, - # - counties, -) -# -import re -from typing import Union - - -def make_dict_jobs_dict(title: str, - job_link: str, - county: Union[str, list], - city: Union[str, list]): - ''' - >>> >>> >>> this function make dict for SustainAnalytics API ---> - - params: title: str, job_link: str, county: str, city: str - return: dict[str, str] - ''' - - # get jobs items from response - return Item( - job_title=title, - job_link=job_link, - company='Sustainalytics', - country='Romania', - county=county, - city=city, - remote=['on-site', 'remote'], - ).to_dict() - - -def get_special_keys(): - ''' - >>> >>> >>> Get special keys from MyWorkDayJobs ---> Sustainanalytics <--- - - params: None - return: play_session: str, - wday_vps_cookie: str, - __cflb: str, - wd_browser_id: str, - calypso_csrf_token: str, - ''' - - str_headers = str(GetHeadersDict('https://morningstar.wd5.myworkdayjobs.com/en-US/Sustainalytics/jobs?locations=0e19b52288b5019ff735e44eda00fe40&locations=0e19b52288b501a1c52ad54eda00f640')) - - new_dict = dict(wday_vps_cookie=r'wday_vps_cookie=([^;]+)', - PLAY_SESSION='PLAY_SESSION=([^;]+)', - wd_broser_id='wd-browser-id=([^;]+)', - CALYPSO_CSRF_TOKEN='CALYPSO_CSRF_TOKEN=([^;]+)') - - # catch data from requests and store it in new dict - regex_dict = dict() - for key, value in new_dict.items(): - match = re.search(value, str_headers) - if match: - regex_form = match.group(1) - regex_dict[key] = regex_form - - return regex_dict - - -# all of this keys need to store in one Session # # # SESION KEYS FOR COOKIES # # # -keys_dict_ids = get_special_keys() - - -def get_dynamic_headers(all_jobs: str=None, one_job_info: str=None): - ''' - >>> >>> >>> get dynamic headers to scrape MyWorkDayJobs API ---> - >>> >>> >>> But, params need to be one True and one False, not both True - - params: all_jobs: str=None, one_job_info: str=None - return: url: str, headers: dict, payload: json - ''' - - # check if params not both True - url = None - if all_jobs is not None: - url = f'https://morningstar.wd5.myworkdayjobs.com/wday/cxs/morningstar/Sustainalytics{all_jobs}' - elif one_job_info is not None: - url = f'https://morningstar.wd5.myworkdayjobs.com/wday/cxs/morningstar/Sustainalytics{one_job_info}' - else: - raise ValueError("'all_jobs' and 'one_job_info' can not be True both.") - - headers = { - 'authority': 'morningstar.wd5.myworkdayjobs.com', - 'accept': 'application/json', - 'accept-language': 'en-US', - 
-        'content-type': 'application/json',
-        'cookie': f"PLAY_SESSION={keys_dict_ids.get('PLAY_SESSION')}; wday_vps_cookie={keys_dict_ids.get('wday_vps_cookie')}; timezoneOffset=-120; wd-browser-id={keys_dict_ids.get('wd_browser_id')}; CALYPSO_CSRF_TOKEN={keys_dict_ids.get('CALYPSO_CSRF_TOKEN')};",
-        'origin': 'https://morningstar.wd5.myworkdayjobs.com',
-        'referer': f'https://morningstar.wd5.myworkdayjobs.com/en-US/Sustainalytics{"/jobs?locations=0e19b52288b5019ff735e44eda00fe40&locations=0e19b52288b501a1c52ad54eda00f640" if all_jobs is not None else one_job_info}',
-        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
-        'x-calypso-csrf-token': f"{keys_dict_ids.get('CALYPSO_CSRF_TOKEN')}",
-    }
-
-    json_data = {
-        'appliedFacets': {
-            'locations': [
-                '0e19b52288b5019ff735e44eda00fe40',
-                '0e19b52288b501a1c52ad54eda00f640',
-            ],
-        },
-        'limit': 20,
-        'offset': 0,
-        'searchText': '',
-    }
-
-    #
-    if all_jobs is not None:
-        return url, headers, json_data
-    else:
-        return url, headers
-
-
-def scraper():
-    '''
-    Scrape the job postings from the Sustainalytics Workday API.
-    '''
-
-    url, headers, json_data = get_dynamic_headers(all_jobs='/jobs')  # scrape the full job list
-
-    post_data = PostRequestJson(url=url, custom_headers=headers, data_json=json_data)
-
-    job_list = []
-    for job in post_data.get('jobPostings'):
-
-        location: Union[str, list] = None
-        if (location := job.get('locationsText').lower()) == 'bucharest':
-            location = 'bucuresti'
-        elif '2 locations' == location:
-
-            # # # # # make another request to the job page to scrape the accurate location # # # # # #
-            # headers for the single-job request
-            url_get, headers_get = get_dynamic_headers(one_job_info=job.get('externalPath'))
-            json_get_data = GetRequestJson(url=url_get, custom_headers=headers_get)
-
-            #
-            location = list()
-            loc = None
-            if (loc := json_get_data.get('jobPostingInfo').get('location').lower()) == 'bucharest' or loc == 'timisoara':
-                if loc == 'bucharest':
-                    loc = 'bucuresti'
-
-                # already lowercase; kept as a safety net
-                loc = loc.lower()
-                #
-                location.append(loc)
-
-            # additional locations
-            for job_sec in json_get_data.get('jobPostingInfo').get('additionalLocations'):
-                new_loc = None
-                for search_city in counties:
-                    for v in search_city.values():
-                        for ccity in v:
-                            if re.search(r'\b{}\b'.format(re.escape(ccity.split()[-1].lower())), job_sec.lower()):
-                                new_loc = ccity.lower()
-                                break
-
-                if new_loc:
-                    location.append(new_loc)
-
-        #
-        if isinstance(location, str):
-            #
-            location_finish = get_county(location=location)
-            #
-            job_list.append(make_dict_jobs_dict(title=job.get('title'),
-                                                job_link=f"https://morningstar.wd5.myworkdayjobs.com/en-US/Sustainalytics{job.get('externalPath')}",
-                                                county=location_finish[0] if True in location_finish else None,
-                                                city=location.title()))
-        #
-        elif isinstance(location, list) and len(location) == 1:
-            #
-            location_finish = get_county(location=location[0])
-            #
-            job_list.append(make_dict_jobs_dict(title=job.get('title'),
-                                                job_link=f"https://morningstar.wd5.myworkdayjobs.com/en-US/Sustainalytics{job.get('externalPath')}",
-                                                county=location_finish[0] if True in location_finish else None,
-                                                city=location[0].title()))
-        #
-        elif isinstance(location, list) and len(location) > 1:
-            #
-            location_finish = [get_county(location=city_jud) for city_jud in location]
-            get_locations_with_none = [lf[0] if True in lf else None for lf in location_finish]
-            #
-            job_list.append(make_dict_jobs_dict(title=job.get('title'),
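-                                                # county is the whole list, or None if any city could not be mapped to a county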
job_link=f"https://morningstar.wd5.myworkdayjobs.com/en-US/Sustainalytics{job.get('externalPath')}", - county=None if None in get_locations_with_none else get_locations_with_none, - city=[xx.title() for xx in location])) - # - return job_list - - -def main(): - ''' - ... Main: - ---> call scraper() - ---> update_jobs() and update_logo() - ''' - - company_name = "Sustainalytics" - logo_link = "https://www.sgs.com/-/media/sgscorp/images/infographics-and-charts/sustainalytics-logo.cdn.en-US.1.png" - - jobs = scraper() - - # uncomment if your scraper done - UpdateAPI().update_jobs(company_name, jobs) - UpdateAPI().update_logo(company_name, logo_link) - -if __name__ == '__main__': - main() diff --git a/test_file.py b/test_file.py deleted file mode 100644 index e22db4f..0000000 --- a/test_file.py +++ /dev/null @@ -1,29 +0,0 @@ -# -# -# -#import requests - -# ---> Daily update -# cookies = { -# 'PHPSESSID': 'sxi2PjU4lVYHwwU83juWfnNwS4VMrrHJ', -# '_cfuvid': '6nf6PvOXmaHQJN4so.LtllqhAS37tR9qtdujabj7RxY-1726315836551-0.0.1.1-604800000', -# } - -# headers = { -# 'accept': 'application/json, text/plain, */*', -# 'accept-language': 'en-US,en;q=0.9', -# # 'cookie': 'PHPSESSID=sxi2PjU4lVYHwwU83juWfnNwS4VMrrHJ; _cfuvid=6nf6PvOXmaHQJN4so.LtllqhAS37tR9qtdujabj7RxY-1726315836551-0.0.1.1-604800000', -# 'priority': 'u=1, i', -# 'referer': 'https://skywindgroup.bamboohr.com/careers/', -# 'sec-ch-ua': '"Chromium";v="128", "Not;A=Brand";v="24", "Google Chrome";v="128"', -# 'sec-ch-ua-mobile': '?0', -# 'sec-ch-ua-platform': '"Linux"', -# 'sec-fetch-dest': 'empty', -# 'sec-fetch-mode': 'cors', -# 'sec-fetch-site': 'same-origin', -# 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36', -# } - -# response = requests.get('https://skywindgroup.bamboohr.com/careers/list', headers=headers) - -# print(response.json())