Skip to content

Commit 851a00d

Browse files
committed
add medium blog py examples
1 parent d58db8e commit 851a00d

File tree

1 file changed

+97
-0
lines changed

1 file changed

+97
-0
lines changed

blogs/medium_blog_inferences.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
# Import library
2+
import datazets as dz
3+
import numpy as np
4+
5+
6+
# Load the data science salaries data set.
df = dz.get('ds_salaries')
# The features are as follows (only displayed in an interactive session).
df.columns

# Keyword groups used to collapse the many raw job titles into a few
# categories. Each inner list is one category; its first entry is the label
# that is stored for every title matching any keyword of that list.
titles = [
    [
        'data scientist',
        'data science',
        'research',
        'applied',
        'specialist',
        'ai',
        'machine learning',
    ],
    ['engineer', 'etl'],
    ['analyst', 'bi', 'business', 'product', 'modeler', 'analytics'],
    ['manager', 'head', 'director'],
    ['architect', 'cloud', 'aws'],
    ['lead/principal', 'lead', 'principal'],
]
18+
19+
# Aggregate job titles.
# Keep a lower-cased snapshot of the raw titles to match against, then reset
# the column so that every title matching no keyword ends up as 'Other'.
job_title = df['job_title'].str.lower()
df['job_title'] = 'Other'
# Store the new names. Groups are applied in order, so a title that matches
# keywords from several groups keeps the label of the last matching group.
for group in titles:
    for keyword in group:
        # Literal substring match (regex=False keeps 'lead/principal' and
        # friends from being read as patterns); missing titles never match.
        matches = job_title.str.contains(keyword, regex=False, na=False)
        # Single .loc assignment instead of chained indexing
        # (df['job_title'][mask] = ...), which may write to a temporary copy
        # and does nothing under pandas Copy-on-Write.
        df.loc[matches, 'job_title'] = group[0]
print(df['job_title'].value_counts())
27+
28+
29+
# Rename categorical variables for better understanding.
# The codes are replaced value-for-value. regex=True is deliberately not used:
# it would treat each code as a substring pattern (e.g. 'S', 'M', 'L') and
# could rewrite text inside an already-replaced label, and the integer keys
# of remote_ratio are not valid patterns at all.
df['experience_level'] = df['experience_level'].replace({
    'EN': 'Entry-level',
    'MI': 'Junior Mid-level',
    'SE': 'Intermediate Senior-level',
    'EX': 'Expert Executive-level / Director',
})
df['employment_type'] = df['employment_type'].replace({
    'PT': 'Part-time',
    'FT': 'Full-time',
    'CT': 'Contract',
    'FL': 'Freelance',
})
df['company_size'] = df['company_size'].replace({
    'S': 'Small (less than 50)',
    'M': 'Medium (50 to 250)',
    'L': 'Large (>250)',
})
df['remote_ratio'] = df['remote_ratio'].replace({
    0: 'No remote',
    50: 'Partially remote',
    100: '>80% remote',
})
34+
35+
# Add new feature: a coarse region derived from the company location.
# Every row starts as 'USA'; only the location codes listed below are
# relabelled, so any location outside the list keeps the 'USA' label.
df['country'] = 'USA'
countries_europe = ['SM', 'DE', 'GB', 'ES', 'FR', 'RU', 'IT', 'NL', 'CH', 'CF', 'FI', 'UA', 'IE', 'GR', 'MK', 'RO', 'AL', 'LT', 'BA', 'LV', 'EE', 'AM', 'HR', 'SI', 'PT', 'HU', 'AT', 'SK', 'CZ', 'DK', 'BE', 'MD', 'MT']
# Single .loc assignment instead of chained indexing (df['country'][mask] = ...),
# which may write to a temporary copy and does nothing under Copy-on-Write.
df.loc[df['company_location'].isin(countries_europe), 'country'] = 'europe'
39+
# Keep a handle on the USD salary column, then remove the redundant
# local-currency columns from the frame in place.
salary_in_usd = df['salary_in_usd']
df.drop(columns=['salary_currency', 'salary'], inplace=True)
42+
43+
# %%
44+
import bnlearn as bn
45+
# Discretize the salary feature.
# 'manual' uses the fixed salary bins below; any other value takes the
# bnlearn-based branch.
discretize_method = 'manual'

if discretize_method == 'manual':
    # Discretize manually.
    # Build the bin masks from the numeric salaries first, before the column
    # is reset, so the bins never depend on the overwritten values.
    salary_bins = [
        (salary_in_usd < 80000, '<80K'),
        ((salary_in_usd >= 80000) & (salary_in_usd < 100000), '80-100K'),
        ((salary_in_usd >= 100000) & (salary_in_usd < 160000), '100-160K'),
        ((salary_in_usd >= 160000) & (salary_in_usd < 250000), '160-250K'),
        (salary_in_usd >= 250000, '>250K'),
    ]
    # Set salary: rows matching no bin (e.g. missing salaries) stay None.
    df['salary_in_usd'] = None
    for in_bin, label in salary_bins:
        # Single .loc assignment instead of chained indexing
        # (df['salary_in_usd'].loc[mask] = ...), which may write to a
        # temporary copy and does nothing under pandas Copy-on-Write.
        df.loc[in_bin, 'salary_in_usd'] = label
else:
    # Discretize automatically but with prior knowledge.
    tmpdf = df[['experience_level', 'salary_in_usd', 'country']]
    # Create edges
    edges = [('experience_level', 'salary_in_usd'), ('country', 'salary_in_usd')]
    # Create DAG based on edges
    DAG = bn.make_DAG(edges)
    bn.plot(DAG)
    # Discretize the continuous columns
    df_disc = bn.discretize(tmpdf, edges, ["salary_in_usd"], max_iterations=1)
    # Store
    df['salary_in_usd'] = df_disc['salary_in_usd']
# Print
print(df['salary_in_usd'].value_counts())
71+
72+
# %%
73+
# Learn the network structure from the prepared data frame
# (methodtype='hc', scoretype='bic').
model = bn.structure_learning.fit(df, methodtype='hc', scoretype='bic')

# %%

# Independence test on the learned model, run without pruning (prune=False).
model = bn.independence_test(model, df, prune=False)
# Parameter learning to learn the CPTs. This step is required to make inferences.
model = bn.parameter_learning.fit(model, df, methodtype="bayes")
# Plot: one static and one interactive figure of the same model.
bn.plot(model, title='Salary data set')
# The title names the settings used for structure learning above; it used to
# say 'method=tan' although the model is learned with methodtype='hc'.
bn.plot(model, interactive=True, title='method=hc and score=bic')
84+
85+
# %%
86+
87+
# Query 1: job title, given a large company.
query = bn.inference.fit(
    model,
    variables=['job_title'],
    evidence={'company_size': 'Large (>250)'},
)

# Query 2: salary bin, given a full evidence profile. The result replaces
# the previous `query` value.
query = bn.inference.fit(
    model,
    variables=['salary_in_usd'],
    evidence={
        'employment_type': 'Full-time',
        'remote_ratio': 'Partially remote',
        'job_title': 'data scientist',
        'employee_residence': 'DE',
        'experience_level': 'Entry-level',
    },
)

0 commit comments

Comments
 (0)