-
Notifications
You must be signed in to change notification settings - Fork 0
/
wuzzufAPI.py
148 lines (111 loc) · 5.01 KB
/
wuzzufAPI.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from extractInfo import *
from searchQuery import *
from math import ceil
def get_single_page(desired_job, page_number, timeout=30):
    """Fetch one Wuzzuf search-results page and parse it into a soup object.

    Args:
        desired_job (str): the desired job title
        page_number (int): the number of the current page, starting with 0
        timeout (float): seconds to wait for the HTTP response (default 30);
            without it a stalled server would hang the whole scrape forever

    Returns:
        BeautifulSoup: parsed page content ("lxml" parser)

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    # set a search query on "wuzzuf.net":
    job_url = set_searchQuery(desired_job, page_number)
    print(job_url)
    # fetch the obtained URL, then get page content; the timeout bounds the wait:
    page_content = requests.get(job_url, timeout=timeout).content
    # parse markup page content using "lxml" parser:
    soup_content = BeautifulSoup(page_content, "lxml")
    return soup_content
def fetch_data(desired_job, page_number=0):
    """Collect job data from every result page of a Wuzzuf search.

    Args:
        desired_job (str): the desired job title
        page_number (int): the page to start from, 0-based

    Returns:
        tuple: nine index-aligned lists — job titles, company names,
        company locations, posting dates, job types, career levels,
        years of experience, job links, and job requirements.
    """
    # accumulators for the collected columns, one entry per job:
    job_titles = []
    company_names = []
    company_locations = []
    posting_dates = []
    job_types = []
    career_levels = []
    years_of_experience = []
    job_links = []
    job_requirements = []
    # fetch the first page only to read the total number of matching jobs:
    soup_content = get_single_page(desired_job, page_number)
    total_jobs = extract_totalJobNumber(soup_content)
    print("Number of jobs found: {}".format(total_jobs))
    # Wuzzuf shows 15 results per page; round up to include a partial last page:
    total_pages = ceil(total_jobs / 15)
    print("Total number of pages: {}\n".format(total_pages))
    print("Start collecting data ...")
    # loop through all pages:
    while page_number < total_pages:
        # get soup content for the current page:
        soup_content = get_single_page(desired_job, page_number)
        # links are needed twice: once as a column, once to fetch requirements
        single_page_links = extract_jobLinks(soup_content)
        # extend each column with this page's extracted values; extend avoids
        # the per-element indexed appends (and the IndexError they would raise
        # if one extractor returned fewer items than the titles list):
        job_titles.extend(extract_jobTitles(soup_content))
        company_names.extend(extract_companyNames(soup_content))
        company_locations.extend(extract_companyLocations(soup_content))
        posting_dates.extend(extract_postingDates(soup_content))
        job_types.extend(extract_jobTypes(soup_content))
        career_levels.extend(extract_careerLevels(soup_content))
        years_of_experience.extend(extract_yearsOfExperience(soup_content))
        job_links.extend(single_page_links)
        # requirements are scraped from each job's own page via its link:
        job_requirements.extend(extract_jobRequirements(single_page_links))
        # get the next page:
        page_number += 1
    return job_titles, company_names, company_locations, \
        posting_dates, job_types, career_levels, \
        years_of_experience, job_links, job_requirements
def wuzzuf_api(desired_job, page_number=0):
    """Run a full Wuzzuf job search and return the results as a dictionary.

    Args:
        desired_job (str): the desired job title
        page_number (int): the page to start from, 0-based

    Returns:
        dict: maps a 0-based job index to a per-job dict with the keys
        "job_title", "company_name", "company_location", "posting_date",
        "job_types", "career_level", "years_of_experience", "job_link",
        and "job_requirements".
    """
    # field names paired, in order, with the columns returned by fetch_data:
    field_names = (
        "job_title",
        "company_name",
        "company_location",
        "posting_date",
        "job_types",
        "career_level",
        "years_of_experience",
        "job_link",
        "job_requirements",
    )
    # desired data to be collected (nine parallel, index-aligned lists):
    columns = fetch_data(desired_job, page_number)
    # transpose the columns into one row per job, then label each field;
    # zip truncates to the shortest column, so a length mismatch cannot
    # raise IndexError:
    return {i: dict(zip(field_names, row)) for i, row in enumerate(zip(*columns))}