"""Compare job titles with qualifications in the ``works.csv`` data set."""

import pandas as pd


def count_not_match_fields(field1, field2, data):
    """Count rows whose two text columns share no word in either direction.

    Args:
        field1: Name of the first column of *data*.
        field2: Name of the second column of *data*.
        data: DataFrame holding both columns; every value must be a string.

    Returns:
        Number of rows for which ``is_match`` fails both ways round.
    """
    return sum(
        1
        for first, second in zip(data[field1], data[field2])
        if not (is_match(first, second) or is_match(second, first))
    )


def get_top(size, data, searched, returned, search_str, *, stem_cut=2):
    """Return the most frequent *returned* values among rows matching a query.

    A row matches when its *searched* value contains the stem of
    *search_str*. The comparison is case-insensitive and literal (the stem is
    not interpreted as a regular expression). Missing values never match.

    Args:
        size: Maximum number of entries to return.
        data: DataFrame holding both columns.
        searched: Name of the column the stem is looked up in.
        returned: Name of the column whose values are counted.
        search_str: Word to look for.
        stem_cut: Number of trailing characters dropped from *search_str*
            before matching, so that inflected forms still match. ``0`` keeps
            the whole word. If nothing is left after the cut, the stem is
            empty and every non-missing row matches.

    Returns:
        Series of counts indexed by the lower-cased *returned* values, most
        frequent first, at most *size* entries long.
    """
    stem = search_str.lower()  # the column is lower-cased, so the query must be too
    if stem_cut > 0:
        stem = stem[:-stem_cut]
    mask = data[searched].str.lower().str.contains(stem, na=False, regex=False)
    return data.loc[mask, returned].str.lower().value_counts().head(size)


def is_match(field1, field2):
    """Return True if any word of *field1* occurs inside *field2*.

    Both values are lower-cased and hyphens in *field1* act as word
    separators. This is a substring test, so a word also matches when it is
    only a part of a longer word in *field2*.
    """
    haystack = field2.lower()  # invariant across the words, computed once
    words = field1.lower().replace('-', ' ').split()
    return any(word in haystack for word in words)


def main():
    """Load ``works.csv`` and print the mismatch count and both top-5 lists."""
    # dropna() without arguments discards every row that has a missing value
    # in any column, not only in the two columns compared below.
    works = pd.read_csv("works.csv").dropna()
    count_not_match = count_not_match_fields("jobTitle", "qualification", works)
    print(f"Из {works.shape[0]} людей не совпадают профессия и должность у {count_not_match}")

    print("\nТоп образований людей, которые работают менеджерами")
    print(get_top(5, works, "jobTitle", "qualification", "менеджер"))

    print("\nТоп должностей людей, которые по диплому являются инженерами")
    print(get_top(5, works, "qualification", "jobTitle", "инженер"))


if __name__ == "__main__":
    main()
"""Lecture exercises on the ``works.csv`` data set."""

import numpy as np
import pandas as pd
import matplotlib as plt  # not used by any task below


def run_lecture_tasks():
    """Run the six lecture tasks in order, printing each result."""
    # The data must be loaded before the first task; every task reads `works`.
    works = pd.read_csv("./works.csv")

    # 1 task
    works.info()  # info() prints by itself and returns None
    print(works.shape[0])
    print(len(works.index))

    # 2 task
    print(works[works["gender"] == "Мужской"].shape[0])
    print((works["gender"] == "Женский").sum())
    print(works["gender"].value_counts())

    # 3 task
    print(works["skills"].notnull().sum())
    works.info()
    print(works["skills"].count())
    print(works[works["skills"].notnull()]["skills"])
    print(works['skills'].dropna())
    # NaN is not equal to itself, so this keeps only rows with skills filled in
    print(works.query("skills == skills")["skills"])
    print(works.query('salary == 15000'))

    # 4 task
    edu = 'Высшее'
    gen = 'Мужской'
    print(works.query("educationType == @edu and gender == @gen")[['salary', "educationType", "gender"]])

    # 5 task
    # na=False makes rows without skills count as "no match" instead of NaN
    mask = works["skills"].str.lower().str.contains("python|питон", na=False)
    print(works[mask]["salary"])

    # 6 task
    person = np.linspace(.1, 1, 10)
    # numeric_only=True: quantiles are undefined for the text columns
    men = works.query('gender == "Мужской"').quantile(person, numeric_only=True)
    women = works.query('gender == "Женский"').quantile(person, numeric_only=True)
    print(men)
    print(women)


if __name__ == "__main__":
    run_lecture_tasks()