diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..723ef36
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.idea
\ No newline at end of file
diff --git a/.idea/.gitignore b/.idea/.gitignore
new file mode 100644
index 0000000..13566b8
--- /dev/null
+++ b/.idea/.gitignore
@@ -0,0 +1,8 @@
+# Default ignored files
+/shelf/
+/workspace.xml
+# Editor-based HTTP Client requests
+/httpRequests/
+# Datasource local storage ignored files
+/dataSources/
+/dataSources.local.xml
diff --git a/homework.py b/homework.py
new file mode 100644
--- /dev/null
+++ b/homework.py
@@ -0,0 +1,55 @@
+import pandas as pd
+
+
+def non_matches(firs_param, second_param, data):
+    """Count rows whose two text columns share no word in either direction.
+
+    firs_param and second_param are column names of data; a row is counted
+    when is_contains fails for the pair of values in both orders.
+    """
+    return sum(
+        1
+        for f1, f2 in zip(data[firs_param], data[second_param])
+        if not is_contains(f1, f2) and not is_contains(f2, f1)
+    )
+
+
+def is_contains(first_field, second_field):
+    """Return True if any word of first_field is a substring of second_field.
+
+    The comparison is case-insensitive, and hyphens in first_field are
+    treated as word separators.
+    """
+    # Lower-case the haystack once instead of once per word.
+    haystack = second_field.lower()
+    words = first_field.lower().replace('-', ' ').split()
+    return any(word in haystack for word in words)
+
+
+def get_top(size, data, search_field, return_field, word_to_search):
+    """Return the size most frequent lower-cased values of return_field.
+
+    Only rows whose search_field contains word_to_search without its last
+    two characters are counted; the comparison is case-insensitive.
+    """
+    # Dropping the last two characters presumably leaves a word stem so that
+    # inflected forms match as well -- TODO confirm.
+    stem = word_to_search[:-2].lower()
+    # regex=False: the stem is literal text, not a pattern. na=False: a
+    # missing search_field counts as "no match" instead of breaking the mask.
+    found = data[search_field].str.lower().str.contains(stem, regex=False, na=False)
+    return data.loc[found, return_field].str.lower().value_counts().head(size)
+
+
+works = pd.read_csv("works.csv").dropna()
+not_matches_count = non_matches("jobTitle", "qualification", works)
+managers = get_top(5, works, "jobTitle", "qualification", "менеджер")
+engineers = get_top(5, works, "qualification", "jobTitle", "инженер")
+output_string = f"Всего записей {works.shape[0]} из них не совпадают {not_matches_count}\n\n" \
+                f"Топ - 5 образовний менеджеров\n" \
+                f"{managers}\n\n" \
+                f"Топ - 5 должностей инженеров\n" \
+                f"{engineers}"
+
+with open('homework_output.txt', 'w', encoding='utf-8') as file:
+    file.write(output_string)
diff --git a/homework_output.txt b/homework_output.txt
new file mode 100644
index 0000000..8afc8ea
--- /dev/null
+++ b/homework_output.txt
@@ -0,0 +1,17 @@
+Всего записей 1068 из них не совпадают 793
+
+Топ - 5 образовний менеджеров
+бакалавр 11
+менеджер 10
+специалист 6
+экономист 6
+экономист-менеджер 4
+Name: qualification, dtype: int64
+
+Топ - 5 должностей инженеров
+заместитель директора 3
+главный инженер 3
+ведущий инженер-конструктор 2
+инженер лесопользования 2
+директор 2
+Name: jobTitle, dtype: int64
\ No newline at end of file
diff --git a/task.py b/task.py
new file mode 100644
--- /dev/null
+++ b/task.py
@@ -0,0 +1,59 @@
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as mp
+
+works = pd.read_csv("works.csv")
+
+# 1. Total number of records
+print("Общее количество записей:", works.shape[0])
+
+# 2. Number of men and of women
+print("Количество мужчин:", (works["gender"] == "Мужской").sum())
+print("Количество женщин:", (works["gender"] == "Женский").sum())
+
+# 3. Number of records with a non-NaN skills value
+print("Количество не NaN значений", works["skills"].count())
+
+# 4. All filled-in skills
+print("Все заполненные скиллы\n", works['skills'].dropna())
+
+# 5. Salaries of people whose skills mention Python
+# The alternation has no spaces around "|": inside a regex they are literal
+# characters, so 'python | питон' only matched "python " or " питон".
+# na=False maps missing skills to False, so no extra notnull() mask is needed.
+skills_bool = works['skills'].str.lower().str.contains('python|питон', na=False)
+print("Зарплата тех, у кого в скиллах есть Python (Питон)\n", works.loc[skills_bool, 'salary'])
+
+# 6. Salary deciles by gender
+salary_p = np.linspace(0.1, 1, 10)
+w = works[works.gender == "Женский"]['salary'].quantile(salary_p)
+m = works[works.gender == "Мужской"]['salary'].quantile(salary_p)
+
+mp.plot(m, salary_p, color='blue', label="Мужчины")
+mp.plot(w, salary_p, color='r', label="Женщины")
+mp.xlabel('salary')
+mp.ylabel('quantile')
+mp.legend()
+mp.show()
+
+# 7. Mean salary by education type and gender
+# Only the salary column is averaged: averaging the whole frame raises on
+# text columns in pandas 2.x. One grouped table also keeps both genders on
+# the same educationType index, so the paired bars stay aligned even when a
+# gender has no rows for some education type (that bar is simply missing).
+mean_salary = (
+    works.groupby(["educationType", "gender"])["salary"]
+    .mean()
+    .unstack("gender")
+    .reindex(columns=["Мужской", "Женский"])
+)
+
+educationTypes = mean_salary.index.values
+index = np.arange(len(educationTypes))
+
+bw = 0.4
+mp.bar(index-bw/2, mean_salary["Мужской"], bw, color="b", label="Средняя зарплата мужчин")
+mp.bar(index+bw/2, mean_salary["Женский"], bw, color="r", label="Средняя зарплата женщин")
+mp.xticks(index, educationTypes, rotation=45)
+mp.legend()
+mp.show()