# Import libraries
import datazets as dz
import numpy as np


# Get the data science salary data set
df = dz.get('ds_salaries')
# The features are as follows
df.columns
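# Columns used in the rest of this script: experience_level, employment_type,
# job_title, salary, salary_currency, salary_in_usd, employee_residence,
# remote_ratio, company_location and company_size.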

titles = [['data scientist', 'data science', 'research', 'applied', 'specialist', 'ai', 'machine learning'],
          ['engineer', 'etl'],
          ['analyst', 'bi', 'business', 'product', 'modeler', 'analytics'],
          ['manager', 'head', 'director'],
          ['architect', 'cloud', 'aws'],
          ['lead/principal', 'lead', 'principal'],
          ]

# Aggregate job titles: the first entry of each group becomes the new name
job_title = df['job_title'].str.lower().copy()
df['job_title'] = 'Other'
# Store the new names
for t in titles:
    for name in t:
        df.loc[job_title.str.contains(name, regex=False), 'job_title'] = t[0]
print(df['job_title'].value_counts())


# Rename categorical variables for better readability
df['experience_level'] = df['experience_level'].replace({'EN': 'Entry-level', 'MI': 'Junior Mid-level', 'SE': 'Intermediate Senior-level', 'EX': 'Expert Executive-level / Director'})
df['employment_type'] = df['employment_type'].replace({'PT': 'Part-time', 'FT': 'Full-time', 'CT': 'Contract', 'FL': 'Freelance'})
df['company_size'] = df['company_size'].replace({'S': 'Small (less than 50)', 'M': 'Medium (50 to 250)', 'L': 'Large (>250)'})
df['remote_ratio'] = df['remote_ratio'].replace({0: 'No remote', 50: 'Partially remote', 100: '>80% remote'})

# Add new feature: company region (USA by default, 'europe' for European company locations)
df['country'] = 'USA'
countries_europe = ['SM', 'DE', 'GB', 'ES', 'FR', 'RU', 'IT', 'NL', 'CH', 'CF', 'FI', 'UA', 'IE', 'GR', 'MK', 'RO', 'AL', 'LT', 'BA', 'LV', 'EE', 'AM', 'HR', 'SI', 'PT', 'HU', 'AT', 'SK', 'CZ', 'DK', 'BE', 'MD', 'MT']
df.loc[df['company_location'].isin(countries_europe), 'country'] = 'europe'
# Keep the numeric salary for discretization, then remove redundant variables
salary_in_usd = df['salary_in_usd']
df.drop(labels=['salary_currency', 'salary'], inplace=True, axis=1)

# %%
import bnlearn as bn
# Discretize the salary feature.
discretize_method = 'manual'

# Discretize manually into salary bins
if discretize_method == 'manual':
    # Overwrite the numeric salary with categorical bins
    df['salary_in_usd'] = None
    df.loc[salary_in_usd < 80000, 'salary_in_usd'] = '<80K'
    df.loc[(salary_in_usd >= 80000) & (salary_in_usd < 100000), 'salary_in_usd'] = '80-100K'
    df.loc[(salary_in_usd >= 100000) & (salary_in_usd < 160000), 'salary_in_usd'] = '100-160K'
    df.loc[(salary_in_usd >= 160000) & (salary_in_usd < 250000), 'salary_in_usd'] = '160-250K'
    df.loc[salary_in_usd >= 250000, 'salary_in_usd'] = '>250K'
else:
    # Discretize automatically but with prior knowledge.
    tmpdf = df[['experience_level', 'salary_in_usd', 'country']]
    # Create edges
    edges = [('experience_level', 'salary_in_usd'), ('country', 'salary_in_usd')]
    # Create DAG based on edges
    DAG = bn.make_DAG(edges)
    bn.plot(DAG)
    # Discretize the continuous columns
    df_disc = bn.discretize(tmpdf, edges, ["salary_in_usd"], max_iterations=1)
    # Store
    df['salary_in_usd'] = df_disc['salary_in_usd']
    # Print
    print(df['salary_in_usd'].value_counts())

# %%
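# Structure learning: hill-climbing ('hc') greedily adds, removes and reverses edges
# to maximize the BIC score of the DAG given the data.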
model = bn.structure_learning.fit(df, methodtype='hc', scoretype='bic')

# %%

# Independence test: compute an edge strength (p-value) for every learned edge;
# prune=False keeps edges that are not statistically significant.
model = bn.independence_test(model, df, prune=False)
# Parameter learning to learn the CPTs. This step is required to make inferences.
model = bn.parameter_learning.fit(model, df, methodtype="bayes")
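# Optionally inspect the learned CPTs (assumption: bn.print_CPD is available in the
# installed bnlearn version).
# bn.print_CPD(model)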
# Plot (the interactive plot may require the optional d3blocks dependency)
bn.plot(model, title='Salary data set')
bn.plot(model, interactive=True, title='method=hc and score=bic')

# %%

query = bn.inference.fit(model, variables=['job_title'],
                         evidence={'company_size': 'Large (>250)'})
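# Show the posterior over job titles given a large company
# (assumption: the query result prints as a probability table).
print(query)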


query = bn.inference.fit(model,
                         variables=['salary_in_usd'],
                         evidence={'employment_type': 'Full-time',
                                   'remote_ratio': 'Partially remote',
                                   'job_title': 'data scientist',
                                   'employee_residence': 'DE',
                                   'experience_level': 'Entry-level'})
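# Show the expected salary range for an entry-level, full-time data scientist
# residing in Germany (assumption: the query result prints as a probability table).
print(query)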