Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Commit #375

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ my_feature_list.pkl
.DS_Store
__pycache__
venv/
tools/enron_mail_20150507.tar.gz
69 changes: 68 additions & 1 deletion datasets_questions/explore_enron_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,74 @@

"""

from tkinter.font import names
import joblib

# Load the Enron dataset pickle into `enron_data`.
# NOTE(review): the same pickle is loaded twice, from "../final_project/" and
# from "./final_project/"; the second assignment overwrites the first.
# Presumably only one of the two paths is meant to stay, depending on the
# working directory the script is run from -- confirm.
# Neither file object returned by open() is explicitly closed.
enron_data = joblib.load(open("../final_project/final_project_dataset.pkl", "rb"))
enron_data = joblib.load(open("./final_project/final_project_dataset.pkl", "rb"))

# Preview the dataset: print the first five (name, record) entries, each
# followed by a blank line.
for person_name in list(enron_data)[:5]:
    print(f"{person_name}: {enron_data[person_name]}\n")

# Dataset overview: how many people, how many features per person, and how
# many persons of interest (POIs).
print(f"Number of data points: {len(enron_data)}")

# The first record is used as the representative for the feature count.
print(f"Number of features: {len(next(iter(enron_data.values())))}")

# POIs according to the dataset itself: records whose 'poi' field equals 1
# (True == 1 in Python, so boolean flags are counted as well).
poi_count = sum(1 for person in enron_data.values() if person['poi'] == 1)
print(f"Number of POIs: {poi_count}")

# POIs according to final_project/poi_names.txt: every line of the file that
# is neither empty nor the URL below (the URL is filtered out because it is
# not a name). The file is read inside a `with` block so the handle is closed.
poi_names_url = "http://usatoday30.usatoday.com/money/industries/energy/2005-12-28-enron-participants_x.htm"
with open("./final_project/poi_names.txt") as poi_file:
    poi_names = [
        line
        for line in poi_file.read().split("\n")
        if line not in ("", poi_names_url)
    ]
print(f"Number of POIs: {len(poi_names)}")
print(poi_names)

# Reference structures reused by the lookups further down: every key (person
# name) of the dataset, and the feature dict of the first person.
names_enron_data = list(enron_data)
first_person_features = list(enron_data.values())[0]


# --- Total stock value of James Prentice ---
# Feature names containing both "stock" and "total" (case-insensitive).
stock_features = [
    feature
    for feature in first_person_features
    if all(token in feature.lower() for token in ("stock", "total"))
]
# Dataset keys containing both "james" and "prentice" (case-insensitive).
james_prentice_name = [
    name
    for name in names_enron_data
    if all(token in name.lower() for token in ("james", "prentice"))
]

# The first match of each search is used.
prentice_record = enron_data[james_prentice_name[0]]
print(f"Total stock value of James Prentice: {prentice_record[stock_features[0]]}")


# --- E-mails from Wesley Colwell to POIs ---
# Dataset keys containing both "wesley" and "colwell" (case-insensitive).
wesley_colwell_name = [
    candidate
    for candidate in names_enron_data
    if all(part in candidate.lower() for part in ("wesley", "colwell"))
]

# Report the 'from_this_person_to_poi' field of the first match.
colwell_record = enron_data[wesley_colwell_name[0]]
print(f"Total Number of emails of Wesley Colwell: {colwell_record['from_this_person_to_poi']}")


# --- Exercised stock options of Jeffrey K Skilling ---
# Dataset keys containing both "jeffrey" and "skillin" (case-insensitive
# substring match, so "SKILLING" qualifies).
jeffrey_skillin_name = [
    candidate
    for candidate in names_enron_data
    if all(part in candidate.lower() for part in ("jeffrey", "skillin"))
]

# Report the 'exercised_stock_options' field of the first match.
skilling_record = enron_data[jeffrey_skillin_name[0]]
print(f"The value of stock options belonging to Jeffrey K Skillin: {skilling_record['exercised_stock_options']}")

# --- Largest total payments among Lay, Skilling and Fastow ---
# The three dataset keys are spelled out explicitly.
Lay_Skilling_Fastow_names = ['LAY KENNETH L', 'FASTOW ANDREW S', 'SKILLING JEFFREY K']
print(f"Name of the person: {Lay_Skilling_Fastow_names}")

# Map each of the three names to that person's 'total_payments' value.
total_payments = {}
for person_name in Lay_Skilling_Fastow_names:
    total_payments[person_name] = enron_data[person_name]['total_payments']

# Name with the largest payment value; on a tie the earliest entry wins.
max_total_payments = max(total_payments.items(), key=lambda item: item[1])[0]
print(f"Name of the person with the highest total payments: {max_total_payments}")

# Collect the records with a known salary and those with a known e-mail
# address; a field equal to the string 'NaN' is treated as unknown.
people_with_quantified_salary = []
people_with_known_emails = []
for record in enron_data.values():
    if record['salary'] != 'NaN':
        people_with_quantified_salary.append(record)
    if record['email_address'] != 'NaN':
        people_with_known_emails.append(record)

print('number of persons with known salarry:', len(people_with_quantified_salary))
print('number of persons with known emails:', len(people_with_known_emails))

# Convert the dataset with the featureFormat helper from ./tools/.
# `sys` is imported here because the visible part of this script never
# imports it; without this the sys.path tweak below raises NameError.
import sys

# Make the helper modules under ./tools/ importable.
sys.path.append("./tools/")
from feature_format import featureFormat

# NOTE(review): `first_person_features` is the first person's feature dict;
# featureFormat presumably expects the feature names to extract -- confirm
# against tools/feature_format.py.
enron_data_array = featureFormat(enron_data, first_person_features)
57 changes: 56 additions & 1 deletion decision_tree/dt_author_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import sys
from time import time
sys.path.append("../tools/")
sys.path.append("./tools/")
from email_preprocess import preprocess


Expand All @@ -24,8 +24,63 @@

#########################################################
### your code goes here ###
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Decision tree (DT): time the construction and the fit on the training split.
t0 = time()
clf = DecisionTreeClassifier(min_samples_split=40, random_state=0)
clf.fit(features_train, labels_train)
training_seconds = time() - t0
print("Training Time:", round(training_seconds, 3), "s")

#########################################################

# Accuracy on the test split, computed two ways: through the estimator's own
# score() and through accuracy_score on the explicit predictions.
pred = clf.predict(features_test)
accuracy = clf.score(features_test, labels_test)

# Arguments are given as (y_true, y_pred); accuracy is symmetric, so the
# value is the same as with the reversed order.
acc = accuracy_score(labels_test, pred)

print("Accuracy:", round(accuracy, 3))
print("Metrics Accuracy:", round(acc, 3))

#########################################################

from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Benchmark three more classifiers on the same split, in this order:
# AdaBoost (AB), k-nearest neighbours (KNN), random forest (RF).
# Each entry is (estimator class, constructor keyword arguments); the
# estimator is built inside the timed region so that construction counts
# towards the reported training time.
benchmark_specs = [
    (AdaBoostClassifier, {"n_estimators": 100, "random_state": 0}),
    (KNeighborsClassifier, {"n_neighbors": 3}),
    (RandomForestClassifier, {"n_estimators": 100, "random_state": 0}),
]

for estimator_cls, estimator_kwargs in benchmark_specs:
    t0 = time()
    clf = estimator_cls(**estimator_kwargs)
    clf.fit(features_train, labels_train)
    print("Training Time:", round(time() - t0, 3), "s")

    # (y_true, y_pred) order; accuracy is symmetric in its two arguments.
    acc = accuracy_score(labels_test, clf.predict(features_test))
    print("Metrics Accuracy:", round(acc, 3))
23 changes: 20 additions & 3 deletions naive_bayes/nb_author_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@
Sara has label 0
Chris has label 1
"""

import sys
from time import time
sys.path.append("../tools/")
sys.path.append("./tools/")
from email_preprocess import preprocess


Expand All @@ -24,10 +24,27 @@

##############################################################
# Enter Your Code Here
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes: time the fit on the training split.
t0 = time()
clf = GaussianNB()
clf.fit(features_train, labels_train)
print("Training Time:", round(time() - t0, 3), "s")

# Time prediction on the test features.
t0 = time()
pred = clf.predict(features_test)
print("Predicting Time:", round(time() - t0, 3), "s")

# Accuracy computed two ways: through the estimator's own score() and through
# accuracy_score on the predictions made above, passed as (y_true, y_pred).
accuracy = clf.score(features_test, labels_test)
acc = accuracy_score(labels_test, pred)

print("Accuracy:", round(accuracy, 3))

# Label spelling corrected (it was printed as "Matice Accuracy:").
print("Metrics Accuracy:", round(acc, 3))
#########################################

##############################################################

##############################################################
'''
Expand Down
106 changes: 101 additions & 5 deletions svm/svm_author_id.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

import sys
from time import time
sys.path.append("../tools/")
sys.path.append("./tools/")
from email_preprocess import preprocess


Expand All @@ -19,21 +19,117 @@
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()


#########################################################
### your code goes here ###
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# Support vector classifier with a linear kernel.
clf = SVC(kernel='linear')

#########################################################
# Time the fit on the training split.
t0 = time()
clf.fit(features_train, labels_train)
training_seconds = time() - t0
print("Training Time:", round(training_seconds, 3), "s")

# Time prediction on the test features.
t0 = time()
pred = clf.predict(features_test)
predicting_seconds = time() - t0
print("Predicting Time:", round(predicting_seconds, 3), "s")

# Accuracy via the estimator's own score() on the test split.
accuracy = clf.score(features_test, labels_test)
print("Accuracy:", round(accuracy, 3))

# Accuracy via accuracy_score on the predictions above, passed as
# (y_true, y_pred); accuracy is symmetric, so the value is unaffected by order.
acc = accuracy_score(labels_test, pred)
print("Metrics Accuracy:", round(acc, 3))


#########################################################
# Training on smaller datasets
#########################################################
# Similar slicing code is provided in the quiz, but that version has an
# indexing issue; the slicing below avoids it, so use this one.

# Keep only the first 1% of the training features and of the training labels.
features_train = features_train[:len(features_train) // 100]
labels_train = labels_train[:len(labels_train) // 100]


# Refit the classifier already bound to `clf` on the reduced training set,
# timing the fit.
t0 = time()
clf.fit(features_train, labels_train)
training_seconds = time() - t0
print("Training Time:", round(training_seconds, 3), "s")

# Time prediction on the (unreduced) test features.
t0 = time()
pred = clf.predict(features_test)
predicting_seconds = time() - t0
print("Predicting Time:", round(predicting_seconds, 3), "s")

# Accuracy via the estimator's own score() on the test split.
accuracy = clf.score(features_test, labels_test)
print("Accuracy:", round(accuracy, 3))

# Accuracy via accuracy_score on the predictions above, passed as
# (y_true, y_pred); accuracy is symmetric, so the value is unaffected by order.
acc = accuracy_score(labels_test, pred)
print("Metrics Accuracy:", round(acc, 3))

#########################################################
# Running the model with an RBF kernel on the small dataset
#########################################################
# Remember: this model is meant to run on the reduced training set (1% of
# the original data).
# Support vector classifier with an RBF kernel.
clf = SVC(kernel='rbf')

# Time the fit.
t0 = time()
clf.fit(features_train, labels_train)
training_seconds = time() - t0
print("Training Time:", round(training_seconds, 3), "s")

# Time prediction on the test features.
t0 = time()
pred = clf.predict(features_test)
predicting_seconds = time() - t0
print("Predicting Time:", round(predicting_seconds, 3), "s")

# Accuracy via the estimator's own score() on the test split.
accuracy = clf.score(features_test, labels_test)
print("Accuracy:", round(accuracy, 3))

# Accuracy via accuracy_score on the predictions above, passed as
# (y_true, y_pred); accuracy is symmetric, so the value is unaffected by order.
acc = accuracy_score(labels_test, pred)
print("Metrics Accuracy:", round(acc, 3))

#########################################################
# Running the model with different C values (10.0, 100.0, 1000.0 and 10000)
# and an RBF kernel on the small dataset
#########################################################
# Remember: this model is meant to run on the reduced training set (1% of
# the original data).
c_values = [10.0, 100.0, 1000.0, 10000]

for c in c_values:
    # Label the run so the four groups of results can be told apart.
    print(f"C = {c}")

    # Support vector classifier with an RBF kernel and the current C.
    clf = SVC(kernel='rbf', C=c)

    # Time the fit.
    t0 = time()
    clf.fit(features_train, labels_train)
    print("Training Time:", round(time() - t0, 3), "s")

    # Time prediction on the test features.
    t0 = time()
    pred = clf.predict(features_test)
    print("Predicting Time:", round(time() - t0, 3), "s")

    # Accuracy via the estimator's own score() on the test split.
    accuracy = clf.score(features_test, labels_test)
    print("Accuracy:", round(accuracy, 3))

    # Accuracy via accuracy_score on the predictions above, passed as
    # (y_true, y_pred).
    acc = accuracy_score(labels_test, pred)
    print("Metrics Accuracy:", round(acc, 3))


4 changes: 2 additions & 2 deletions tools/email_preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from sklearn.feature_selection import SelectPercentile, f_classif


def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
def preprocess(words_file = "./tools/word_data.pkl", authors_file="./tools/email_authors.pkl"):
"""
this function takes a pre-made list of email texts (by default word_data.pkl)
and the corresponding authors (by default email_authors.pkl) and performs
Expand Down Expand Up @@ -48,7 +48,7 @@ def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/ema

### feature selection, because text is super high dimensional and
### can be really computationally chewy as a result
selector = SelectPercentile(f_classif, percentile=10)
selector = SelectPercentile(f_classif, percentile=1)
selector.fit(features_train_transformed, labels_train)
features_train_transformed = selector.transform(features_train_transformed).toarray()
features_test_transformed = selector.transform(features_test_transformed).toarray()
Expand Down