udacity · Dramkadry · Mar 22, 2024 · Mar 26, 2024 · Mar 31, 2024 · Apr 5, 2024
@@ -11,3 +11,4 @@ my_feature_list.pkl
 .DS_Store
 __pycache__
 venv/
+tools/enron_mail_20150507.tar.gz
@@ -15,7 +15,74 @@
 
 """
 
+from tkinter.font import names
 import joblib
 
-enron_data = joblib.load(open("../final_project/final_project_dataset.pkl", "rb"))
+enron_data = joblib.load(open("./final_project/final_project_dataset.pkl", "rb"))
 
+# Print the first 5 items in the enron_data dictionary
+for i, (key, value) in enumerate(enron_data.items()):
+    if i >= 5:
+        break
+    print(f"{key}: {value}\n")
+
+# Print the number of data points (people) in the dataset
+print(f"Number of data points: {len(enron_data)}")
+# Print the number of features for each person in the dataset
+print(f"Number of features: {len(list(enron_data.values())[0])}")
+# Count the number of POIs in the dataset
+print(f"Number of POIs: {sum([1 for person in enron_data.values() if person['poi']== 1])}") # == 1 or == True or only if person['poi'] without == 1
+# Build the list of POI names from ./final_project/poi_names.txt (skipping blank lines and the source URL line) and print how many there are
+poi_names = [name for name in open("./final_project/poi_names.txt").read().split("\n") if name not in ["", "http://usatoday30.usatoday.com/money/industries/energy/2005-12-28-enron-participants_x.htm"]]
+print(f"Number of POIs: {len(poi_names)}")
+print(poi_names)
+
+# List of features for each person in the dataset
+first_person_features = list(enron_data.values())[0]
+# List of names in the dataset
+names_enron_data = list(enron_data.keys())
+
+
+# Find the feature name(s) containing both 'stock' and 'total', and the dictionary key(s) matching James Prentice
+stock_features = [feature for feature in first_person_features.keys() if 'stock' in feature.lower() and 'total' in feature.lower()]
+james_prentice_name = [name for name in names_enron_data if 'james' in name.lower() and 'prentice' in name.lower()]
+# print(f"Stock features: {stock_features}")
+# print(f"Name of the person: {james_prentice_name}")
+
+# Print the total value of the stock belonging to James Prentice
+print(f"Total stock value of James Prentice: {enron_data[james_prentice_name[0]][stock_features[0]]}")
+
+
+# Find the dictionary key(s) matching Wesley Colwell
+wesley_colwell_name = [name for name in names_enron_data if 'wesley' in name.lower() and 'colwell' in name.lower()]
+# print(f"Name of the person: {wesley_colwell_name}")
+# Print Number of emails sent from Wesley Colwell to POIs
+print(f"Total Number of emails of Wesley Colwell: {enron_data[wesley_colwell_name[0]]['from_this_person_to_poi']}")
+
+
+# Find the dictionary key(s) matching Jeffrey K Skilling
+jeffrey_skillin_name = [name for name in names_enron_data if 'jeffrey' in name.lower() and 'skillin' in name.lower()]
+# print(f"Name of the person: {jeffrey_skillin_name}")
+
+# Print the value of exercised stock options belonging to Jeffrey K Skilling
+print(f"The value of stock options belonging to Jeffrey K Skillin: {enron_data[jeffrey_skillin_name[0]]['exercised_stock_options']}")
+
+# Collect the total payments to Lay, Skilling and Fastow so they can be compared
+# Lay_Skilling_Fastow_names = [name for name in names_enron_data if 'jeffrey' in name.lower() or 'lay' in name.lower() or 'fastow' in name.lower()]
+Lay_Skilling_Fastow_names = ['LAY KENNETH L', 'FASTOW ANDREW S', 'SKILLING JEFFREY K']
+print(f"Name of the person: {Lay_Skilling_Fastow_names}")
+total_payments = {name: enron_data[name]['total_payments'] for name in Lay_Skilling_Fastow_names}
+# Print the name of the person with the highest total payments
+max_total_payments = max(total_payments, key=total_payments.get)
+print(f"Name of the person with the highest total payments: {max_total_payments}")
+
+# Count the people with a quantified salary and the people with a known email address
+people_with_quantified_salary = [person for person in enron_data.values() if person['salary'] != 'NaN']
+people_with_known_emails = [person for person in enron_data.values() if person['email_address'] != 'NaN']
+
+print(f'number of persons with known salarry:', len(people_with_quantified_salary))
+print(f'number of persons with known emails:', len(people_with_known_emails))
+
+sys.path.append("./tools/")
+from feature_format import featureFormat
+enron_data_array = featureFormat(enron_data, first_person_features)
@@ -10,7 +10,7 @@
 
 import sys
 from time import time
-sys.path.append("../tools/")
+sys.path.append("./tools/")
 from email_preprocess import preprocess
 
 
@@ -24,8 +24,63 @@
 
 #########################################################
 ### your code goes here ###
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.metrics import accuracy_score
 
+# Create a Decision Tree Classifier (DT) object
+t0 = time()
+clf = DecisionTreeClassifier(random_state=0, min_samples_split=40)
+clf.fit(features_train, labels_train)        
+print("Training Time:", round(time()-t0, 3), "s")
 
 #########################################################
 
+pred = clf.predict(features_test)
+accuracy = clf.score(features_test, labels_test)
 
+acc = accuracy_score(pred, labels_test)
+
+print("Accuracy:", round(accuracy,3))
+print("Metrics Accuracy:", round(acc, 3))
+
+#########################################################
+
+from sklearn.ensemble import AdaBoostClassifier
+from sklearn.metrics import accuracy_score
+
+# Create an AdaBoost Classifier (AB) object
+t0 = time()
+clf = AdaBoostClassifier(n_estimators=100, random_state=0)
+clf.fit(features_train, labels_train)
+print("Training Time:", round(time()-t0, 3), "s")
+
+acc = accuracy_score(clf.predict(features_test), labels_test)
+
+print("Metrics Accuracy:", round(acc, 3))
+
+
+#########################################################
+from sklearn.neighbors import KNeighborsClassifier
+
+# Create a KNeighbors Classifier (KNN) object
+t0 = time()
+clf = KNeighborsClassifier(n_neighbors=3)
+clf.fit(features_train, labels_train)
+print("Training Time:", round(time()-t0, 3), "s")
+
+acc = accuracy_score(clf.predict(features_test), labels_test)
+print("Metrics Accuracy:", round(acc, 3))
+
+
+#########################################################
+
+from sklearn.ensemble import RandomForestClassifier
+
+# Create a RandomForest Classifier (RF) object
+t0 = time()
+clf = RandomForestClassifier(n_estimators=100, random_state=0)
+clf.fit(features_train, labels_train)
+print("Training Time:", round(time()-t0, 3), "s")
+
+acc = accuracy_score(clf.predict(features_test), labels_test)
+print("Metrics Accuracy:", round(acc, 3))
@@ -9,10 +9,10 @@
     Sara has label 0
     Chris has label 1
 """
-    
+
 import sys
 from time import time
-sys.path.append("../tools/")
+sys.path.append("./tools/")
 from email_preprocess import preprocess
 
 
@@ -24,10 +24,27 @@
 
 ##############################################################
 # Enter Your Code Here
+from sklearn.naive_bayes import GaussianNB
 
+t0 = time()
+clf = GaussianNB()
+clf.fit(features_train, labels_train)
+print("Training Time:", round(time()-t0, 3), "s")
 
+t0 = time()
+pred = clf.predict(features_test)
+print("Predicting Time:", round(time()-t0, 3), "s")
+
+accuracy = clf.score(features_test, labels_test)
+
+from sklearn.metrics import accuracy_score
+acc = accuracy_score(pred, labels_test)
+
+print("Accuracy:", round(accuracy, 3))
+
+print("Matice Accuracy:", round(acc, 3))
+#########################################
 
-##############################################################
 
 ##############################################################
 '''

@@ -10,7 +10,7 @@
 
 import sys
 from time import time
-sys.path.append("../tools/")
+sys.path.append("./tools/")
 from email_preprocess import preprocess
 
 
@@ -19,21 +19,117 @@
 ### labels_train and labels_test are the corresponding item labels
 features_train, features_test, labels_train, labels_test = preprocess()
 
-
 #########################################################
 ### your code goes here ###
+from sklearn.svm import SVC
+from sklearn.metrics import accuracy_score
 
+# Create a Support Vector Classifier (SVC) object with a linear kernel
+clf = SVC(kernel='linear')
 
-#########################################################
+# Record the start time for training
+t0 = time()
+clf.fit(features_train, labels_train)
+print("Training Time:", round(time()-t0, 3), "s")
+
+# Record the start time for predicting
+t0 = time()
+pred = clf.predict(features_test)
+print("Predicting Time:", round(time()-t0, 3), "s")
+
+# Calculate and print the accuracy of the model using the test data
+accuracy = clf.score(features_test, labels_test)
+print("Accuracy:", round(accuracy, 3))
 
+# Cross-check the accuracy with scikit-learn's accuracy_score, using the predicted values and the actual labels
+acc = accuracy_score(pred, labels_test)
+print("Metrics Accuracy:", round(acc, 3))
+
+
+#########################################################
+# Training on smaller datasets
 #########################################################
 '''
 You'll be Provided similar code in the Quiz
 But the Code provided in Quiz has an Indexing issue
 The Code Below solves that issue, So use this one
 '''
 
-# features_train = features_train[:int(len(features_train)/100)]
-# labels_train = labels_train[:int(len(labels_train)/100)]
+# Reduce the size of the features_train list to 1% of its original size
+features_train = features_train[:int(len(features_train)/100)]
+
+# Reduce the size of the labels_train list to 1% of its original size
+labels_train = labels_train[:int(len(labels_train)/100)]
+
+
+# Record the start time for training
+t0 = time()
+clf.fit(features_train, labels_train)
+print("Training Time:", round(time()-t0, 3), "s")
+
+# Record the start time for predicting
+t0 = time()
+pred = clf.predict(features_test)
+print("Predicting Time:", round(time()-t0, 3), "s")
+
+# Calculate and print the accuracy of the model using the test data
+accuracy = clf.score(features_test, labels_test)
+print("Accuracy:", round(accuracy, 3))
+
+# Cross-check the accuracy with scikit-learn's accuracy_score, using the predicted values and the actual labels
+acc = accuracy_score(pred, labels_test)
+print("Metrics Accuracy:", round(acc, 3))
 
 #########################################################
+# Running the model with an RBF kernel on the small dataset
+########################################################
+# Remember: this model is trained on the smaller dataset (1% of the original training set)
+# Create a Support Vector Classifier (SVC) object with a RBF kernel
+clf = SVC(kernel='rbf')
+
+# Record the start time for training
+t0 = time()
+clf.fit(features_train, labels_train)
+print("Training Time:", round(time()-t0, 3), "s")
+
+# Record the start time for predicting
+t0 = time()
+pred = clf.predict(features_test)
+print("Predicting Time:", round(time()-t0, 3), "s")
+
+# Calculate and print the accuracy of the model using the test data
+accuracy = clf.score(features_test, labels_test)
+print("Accuracy:", round(accuracy, 3))
+
+# Cross-check the accuracy with scikit-learn's accuracy_score, using the predicted values and the actual labels
+acc = accuracy_score(pred, labels_test)
+print("Metrics Accuracy:", round(acc, 3))
+
+#########################################################
+# Running the RBF-kernel model with different C values (10.0, 100.0, 1000.0 and 10000) on the small dataset
+########################################################
+# Remember: this model is trained on the smaller dataset (1% of the original training set)
+# Create a Support Vector Classifier (SVC) object with an RBF kernel for each C value
+c_values = [10.0, 100.0, 1000.0, 10000]
+
+for c in c_values:
+    clf = SVC(kernel='rbf', C=c)
+
+    # Record the start time for training
+    t0 = time()
+    clf.fit(features_train, labels_train)
+    print("Training Time:", round(time()-t0, 3), "s")
+
+    # Record the start time for predicting
+    t0 = time()
+    pred = clf.predict(features_test)
+    print("Predicting Time:", round(time()-t0, 3), "s")
+
+    # Calculate and print the accuracy of the model using the test data
+    accuracy = clf.score(features_test, labels_test)
+    print("Accuracy:", round(accuracy, 3))
+
+    # Cross-check the accuracy with scikit-learn's accuracy_score, using the predicted values and the actual labels
+    acc = accuracy_score(pred, labels_test)
+    print("Metrics Accuracy:", round(acc, 3))
+
@@ -8,7 +8,7 @@
 from sklearn.feature_selection import SelectPercentile, f_classif
 
 
-def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"):
+def preprocess(words_file = "./tools/word_data.pkl", authors_file="./tools/email_authors.pkl"):
     """ 
         this function takes a pre-made list of email texts (by default word_data.pkl)
         and the corresponding authors (by default email_authors.pkl) and performs
@@ -48,7 +48,7 @@ def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/ema
 
     ### feature selection, because text is super high dimensional and 
     ### can be really computationally chewy as a result
-    selector = SelectPercentile(f_classif, percentile=10)
+    selector = SelectPercentile(f_classif, percentile=1)
     selector.fit(features_train_transformed, labels_train)
     features_train_transformed = selector.transform(features_train_transformed).toarray()
     features_test_transformed  = selector.transform(features_test_transformed).toarray()