Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions homework_task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#task8
import pandas
import numpy
import matplotlib.pyplot as plt

# Load the dataset, then keep only the rows that have no missing values.
works = pandas.read_csv("works.csv")
works = works.dropna()
def count(field1, field2, works):
    """Count the rows in which two text columns have no word in common.

    A row is counted when ``match`` fails in both directions, i.e. no word
    of ``works[field1]`` is found in ``works[field2]`` and vice versa.

    Args:
        field1: Name of the first column to compare.
        field2: Name of the second column to compare.
        works: Table indexable by column name (here a pandas DataFrame).

    Returns:
        The number of rows whose two values do not match.
    """
    return sum(
        1
        for f1, f2 in zip(works[field1], works[field2])
        if not match(f1, f2) and not match(f2, f1)
    )

def match(f1, f2):
    """Return True if any word of *f1* occurs as a substring of *f2*.

    The comparison is case-insensitive. Hyphens in *f1* are treated as word
    separators, so each part of a hyphenated term is checked on its own.

    Args:
        f1: Text that is split into words.
        f2: Text that is searched for those words.

    Returns:
        True when at least one word of *f1* is contained in *f2*; False
        otherwise, including when *f1* has no words or when either value
        is not a string (for example a NaN read from a CSV file).
    """
    if not isinstance(f1, str) or not isinstance(f2, str):
        return False
    # Lower-case the searched text once rather than on every iteration.
    haystack = f2.lower()
    words = f1.lower().replace('-', ' ').split()
    return any(word in haystack for word in words)


# Number of people whose job title and qualification share no common word.
result = count("jobTitle", "qualification", works)
print("Из {} людей не совпадают профессия и должность у {}".format(works.shape[0], result))

# The search patterns drop the last two letters ('менедж', 'инжен') so that
# other forms of the word are matched as well.
print("\nТоп образований людей для менеджеров")
is_manager = works['jobTitle'].str.lower().str.contains('менеджер'[:-2])
print(works[is_manager]['qualification'].str.lower().value_counts().head(5))

print("\nТоп должностей людей, которые по диплому являются инженерами")
# Select people by their diploma (qualification) and rank their job titles;
# the filter and the counted column must not be the same way round as above.
is_engineer = works['qualification'].str.lower().str.contains('инженер'[:-2])
print(works[is_engineer]['jobTitle'].str.lower().value_counts().head(5))
13 changes: 13 additions & 0 deletions report.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
1. Профессия и должность не совпадают у 793 человек.
2. Топ-5 образований для менеджеров:
бакалавр 11
менеджер 10
специалист 6
экономист 6
экономист-менеджер 4
3. Топ-5 профессий людей, получивших диплом инженера:
заместитель директора 3
главный инженер 3
ведущий инженер-конструктор 2
инженер лесопользования 2
директор 2
75 changes: 75 additions & 0 deletions task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import pandas
import numpy
import matplotlib.pyplot as plt

# Load the dataset and show, per row, whether the lower-cased skills text
# mentions Python.
works = pandas.read_csv("works.csv")
skills_lower = works['skills'].str.lower()
print(skills_lower.str.contains('python|питон'))

# task1: first and last five rows, then the number of records (two ways)
works = pandas.read_csv("works.csv")

head = works.head(5)
tail = works.tail(5)
print(head)
print(tail)

print(len(works))
print(works.index.size)

# task2: number of men, number of women, and the full gender breakdown
is_male = works['gender'] == 'Мужской'
print(len(works[is_male]))
print(works['gender'].eq('Женский').sum())
print(works['gender'].value_counts())

# task3: number of records with a non-missing skills value
print(works['skills'].notnull().sum())
# info() writes its summary to stdout itself and returns None, so wrapping
# it in print() would only add a stray "None" line.
works.info()
print(works['skills'].count())

# task4: non-missing skills selected three different ways, then row filters
skills = works['skills']
print(skills[skills.notnull()])
print(skills.dropna())
# NaN is not equal to itself, so this query keeps only rows with skills set.
rows_with_skills = works.query("skills == skills")
print(rows_with_skills["skills"])

print(works[works['salary'] == 15000])

edu = 'Высшее'
gen = 'Женский'
shown_columns = ['salary', 'educationType', 'gender']
selected = works.query("educationType == @edu and gender == @gen")
print(selected[shown_columns])

# task5: salaries of people whose skills mention Python
# na=False treats rows with missing skills as non-matches, so the mask is
# plain boolean and needs no extra notnull() filter.
mask = works["skills"].str.lower().str.contains("python|питон", na=False)
print(works[mask]["salary"])

# task6: salary quantiles (10% .. 100%) plotted separately for men and women
percentiles = numpy.linspace(.1, 1, 10)

# Quantiles are taken from the salary column only: the axis is labelled as
# salary, and the table also holds text columns that have no quantiles.
gen = "Мужской"
men_salary = works.query('gender == @gen')["salary"].quantile(percentiles)
fig, ax = plt.subplots()
ax.plot(percentiles, men_salary)
plt.xlabel('Перцентили')
plt.ylabel('Зарплата мужчин')
plt.show()

gen = "Женский"
women_salary = works.query('gender == @gen')["salary"].quantile(percentiles)
fig, ax = plt.subplots()
ax.plot(percentiles, women_salary)
plt.xlabel('Перцентили')
plt.ylabel('Зарплата женщин')
plt.show()

# task7: mean salary per education type, men and women side by side
# Only the salary column is averaged; the result is indexed by education type.
gen = "Мужской"
men_salary = works.query('gender == @gen').groupby("educationType")["salary"].mean()
gen = "Женский"
women_salary = works.query('gender == @gen').groupby("educationType")["salary"].mean()

# Align both groups on the same set of education types so every pair of bars
# refers to the same category; a type missing for one gender gets a zero bar.
types = men_salary.index.union(women_salary.index)
men = men_salary.reindex(types, fill_value=0).values
women = women_salary.reindex(types, fill_value=0).values

positions = numpy.arange(len(types))

plt.bar(positions - 0.2, men, 0.4, color="g", label = "Средняя зарплата мужчин")
plt.bar(positions + 0.2, women, 0.4, color="y", label = "Средняя зарплата женщин")
plt.xticks(positions, types, rotation=45)
plt.legend()
plt.show()