diff --git a/homework_task.py b/homework_task.py
new file mode 100644
index 0000000..cd7a4cd
--- /dev/null
+++ b/homework_task.py
@@ -0,0 +1,40 @@
+#task8
+import pandas
+import numpy
+import matplotlib.pyplot as plt
+
+works = pandas.read_csv("works.csv").dropna()
+
+
+def count(field1, field2, works):
+    """Count rows whose two text fields have no word in common.
+
+    A row is counted when no word of either field occurs as a substring
+    of the other field (see ``match``); ``works`` is indexed by column name.
+    """
+    return sum(
+        1
+        for f1, f2 in zip(works[field1], works[field2])
+        if not match(f1, f2) and not match(f2, f1)
+    )
+
+
+def match(f1, f2):
+    """Return True if any word of ``f1`` occurs as a substring of ``f2``.
+
+    The comparison is case-insensitive and hyphens in ``f1`` separate words.
+    """
+    haystack = f2.lower()  # lower-case once, not once per word
+    words = f1.lower().replace('-', ' ').split()
+    return any(word in haystack for word in words)
+
+
+result = count("jobTitle", "qualification", works)
+print("Из {} людей не совпадают профессия и должность у {}".format(works.shape[0], result))
+
+print("\nТоп образований людей для менеджеров")
+print(works[works['jobTitle'].str.lower().str.contains('менеджер'[:-2])]['qualification'].str.lower().value_counts().head(5))
+
+print("\nТоп должностей людей, которые по диплому являются инженерами")
+# Engineers "by diploma" are selected on `qualification`; the ranking is over `jobTitle`.
+print(works[works['qualification'].str.lower().str.contains('инженер'[:-2])]['jobTitle'].str.lower().value_counts().head(5))
\ No newline at end of file
diff --git a/report.md b/report.md
new file mode 100644
index 0000000..1a450b4
--- /dev/null
+++ b/report.md
@@ -0,0 +1,13 @@
+1.Профессия и должность не совпадают у 793 человек. 
+2.Топ-5 образований для менеджеров:
+бакалавр 11
+менеджер 10
+специалист 6
+экономист 6
+экономист-менеджер 4
+Топ-5 профессий людей, получивших диплом инженера:
+заместитель директора 3
+главный инженер 3
+ведущий инженер-конструктор 2
+инженер лесопользования 2
+директор 2
\ No newline at end of file
diff --git a/task.py b/task.py
new file mode 100644
index 0000000..06d2ca4
--- /dev/null
+++ b/task.py
@@ -0,0 +1,79 @@
+import pandas
+import numpy
+import matplotlib.pyplot as plt
+
+works = pandas.read_csv("works.csv")
+print(works['skills'].str.lower().str.contains('python|питон'))
+
+#task1
+works = pandas.read_csv("works.csv")
+head = works.head(5)
+print(head)
+
+tail = works.tail(5)
+print(tail)
+print(works.shape[0])
+print(len(works.index))
+
+#task2
+print(works[works['gender'] == 'Мужской'].shape[0])
+print((works['gender'] == 'Женский').sum())
+print(works['gender'].value_counts())
+
+#task3
+print(works['skills'].notnull().sum())
+# DataFrame.info() writes its report itself and returns None, so it is not wrapped in print().
+works.info()
+print(works['skills'].count())
+
+#task4
+print(works[works['skills'].notnull()]['skills'])
+print(works['skills'].dropna())
+print(works.query("skills == skills")["skills"])
+print(works.query("salary == 15000"))
+edu = 'Высшее'
+gen = 'Женский'
+print(works.query("educationType == @edu and gender == @gen")[['salary', 'educationType','gender']])
+
+#task5
+# na=False maps rows with missing skills to False, giving a plain boolean mask.
+mask = works["skills"].str.lower().str.contains("python|питон", na=False)
+print(works[mask]["salary"])
+
+#task6
+percentiles = numpy.linspace(.1, 1, 10)
+
+gen = "Мужской"
+# Quantiles are taken over the salary column only: the frame also holds text columns.
+men_salary = works.query('gender == @gen')["salary"].quantile(percentiles)
+fig, ax = plt.subplots()
+ax.plot(percentiles, men_salary)
+plt.xlabel('Перцентили')
+plt.ylabel('Зарплата мужчин')
+plt.show()
+
+gen = "Женский"
+women_salary = works.query('gender == @gen')["salary"].quantile(percentiles)
+fig, ax = plt.subplots()
+ax.plot(percentiles, women_salary)
+plt.xlabel('Перцентили')
+plt.ylabel('Зарплата женщин')
+plt.show()
+
+#task7
+gen = "Мужской"
+men_salary = works.query('gender == @gen').groupby("educationType")["salary"].mean()
+gen = "Женский"
+women_salary = works.query('gender == @gen').groupby("educationType")["salary"].mean()
+
+# Align both series on the union of education types so the bars share one set of ticks.
+types = men_salary.index.union(women_salary.index)
+men = men_salary.reindex(types).values
+women = women_salary.reindex(types).values
+positions = numpy.arange(len(types))
+
+plt.bar(positions - 0.2, men, 0.4, color="g", label = "Средняя зарплата мужчин")
+plt.bar(positions + 0.2, women, 0.4, color="y", label = "Средняя зарплата женщин")
+plt.xticks(positions, types, rotation=45)
+plt.legend()
+plt.show()
\ No newline at end of file