From 06a199348cdeee022122b4f8c3a63b1b1b9730a1 Mon Sep 17 00:00:00 2001
From: jinxianwie <2948950250@qq.com>
Date: Tue, 3 Oct 2023 11:44:03 +0800
Subject: [PATCH 1/8] [Feature] Support PCA for classification

---
 machine_learning/pca/pca_cla.py | 88 +++++++++++++++++++++++++++++++++
 1 file changed, 88 insertions(+)
 create mode 100644 machine_learning/pca/pca_cla.py

diff --git a/machine_learning/pca/pca_cla.py b/machine_learning/pca/pca_cla.py
new file mode 100644
index 0000000..e5a216e
--- /dev/null
+++ b/machine_learning/pca/pca_cla.py
@@ -0,0 +1,88 @@
+import os
+import sys
+current_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.insert(0, current_path)
+
+import matplotlib.pyplot as plt
+
+from sklearn.decomposition import PCA
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+
+from utils.read_csv_data import read_csv_data
+
+def main():
+    # 1. Read the csv data
+    name_dict, data = read_csv_data("dataset/iris.csv")
+    # The three class labels of the iris dataset
+    label_dict = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
+    label_list = ['setosa', 'versicolor', 'virginica']
+
+    # 2. Separate features and labels
+    x = data[:, :-1]
+    y = data[:, -1]
+
+    # 3. Preprocess the features
+    # Before PCA the features must be standardized so that they all contribute
+    # on the same scale. Fitting the scaler on the full data would leak test-set
+    # statistics, so the data is split first and the scaler is fitted on the
+    # training set only.
+
+    # 4. Split into training and test sets
+    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.5, random_state=0)
+    # During training only the training set is available; the test set has to be
+    # transformed with the scaler fitted on the training data
+    scaler = StandardScaler().fit(x_train)
+    x_train = scaler.transform(x_train)
+    x_test = scaler.transform(x_test)
+
+    # Reduce the dimensionality of the training and test sets with PCA
+    k = 0.98  # fraction of the variance to retain
+    pca = PCA(n_components=k)
+    x_train_pca = pca.fit_transform(x_train)  # fit on the training set and reduce it
+    x_test_pca = pca.transform(x_test)  # reduce the test set with the fitted PCA
+    print("Number of principal components: {}".format(pca.n_components_))
+    # Interpretation: reducing to this many dimensions retains 98% of the
+    # information (variance) of the original features
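+    # Optional sanity check: with a float n_components, sklearn keeps the
+    # smallest number of components whose cumulative explained variance ratio
+    # reaches k, so the sum below should come out >= 0.98
+    print("Cumulative explained variance: {:.4f}".format(pca.explained_variance_ratio_.sum()))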
+
+    # 5. Fit a logistic regression model on the reduced training set
+    model = LogisticRegression()
+    model.fit(x_train_pca, y_train)
+
+    # 6. Classify the reduced test set and evaluate the model
+    y_pred = model.predict(x_test_pca)
+    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
+    precision = precision_score(y_true=y_test, y_pred=y_pred, average='macro')
+    recall = recall_score(y_true=y_test, y_pred=y_pred, average='macro')
+    f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
+    print(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 score: {f1}")
+
+    report = classification_report(y_true=y_test, y_pred=y_pred)
+    print(report)
+
+    # Visualize the classes on the first two principal components
+    plt.figure()
+    colors = ["navy", "turquoise", "darkorange"]
+    lw = 2
+    for color, i, target_name in zip(colors, [0, 1, 2], label_list):
+        plt.scatter(
+            x_train_pca[y_train == i, 0], x_train_pca[y_train == i, 1], color=color, alpha=0.8, lw=lw, label=target_name
+        )
+    plt.legend(loc="best", shadow=False, scatterpoints=1)
+    plt.title("x_train_pca of IRIS dataset")
+    plt.savefig("./x_train_pca.png")
+
+    plt.figure()
+    colors = ["navy", "turquoise", "darkorange"]
+    lw = 2
+    for color, i, target_name in zip(colors, [0, 1, 2], label_list):
+        plt.scatter(
+            x_test_pca[y_test == i, 0], x_test_pca[y_test == i, 1], color=color, alpha=0.8, lw=lw, label=target_name
+        )
+    plt.legend(loc="best", shadow=False, scatterpoints=1)
+    plt.title("x_test_pca of IRIS dataset")
+    plt.savefig("./x_test_pca.png")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 73f3c50c0aba087070d91043ab7750b1994d90b3 Mon Sep 17 00:00:00 2001
From: jinxianwie <2948950250@qq.com>
Date: Tue, 3 Oct 2023 11:49:38 +0800
Subject: [PATCH 2/8] update README.md

---
 README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6e84f6d..33f8ebc 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,11 @@
 - [x] SVM (classification task)
 Visualization of the decision boundaries of different SVM classifiers on the test set
 ![svm](https://github.com/jinxianwei/CloudImg/assets/81373517/36703295-9af3-406e-b8bb-728c77852bb8)
-
+- [x] LogisticRegression with PCA
+Visualization of the classes on the first two principal components of the training-set features
+![x_train_pca](https://github.com/jinxianwei/CloudImg/assets/81373517/00878756-df1f-4e64-a04b-213371fda10b)
+Visualization of the classes on the first two principal components of the test-set features
+![x_test_pca](https://github.com/jinxianwei/CloudImg/assets/81373517/d14fa1de-e5bf-46f2-8707-91d86bb2be21)
 #### Deep Learning
 Depends on **Pytorch**, framework **Pytorch_Lightning**
 - [x] Regression task (npv concrete strength data)

From 69a2522ae3bf36c905424ae5157f9777a5242c42 Mon Sep 17 00:00:00 2001
From: jinxianwei <81373517+jinxianwei@users.noreply.github.com>
Date: Tue, 3 Oct 2023 19:08:47 +0800
Subject: [PATCH 3/8] Update README.md

update image
---
 README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 33f8ebc..f4d8d0b 100644
--- a/README.md
+++ b/README.md
@@ -24,9 +24,8 @@
 Visualization of the decision boundaries of different SVM classifiers on the test set
 ![svm](https://github.com/jinxianwei/CloudImg/assets/81373517/36703295-9af3-406e-b8bb-728c77852bb8)
 - [x] LogisticRegression with PCA
-Visualization of the classes on the first two principal components of the training-set features
+Visualization of the classes on the first two principal components of the training-set and test-set features
 ![x_train_pca](https://github.com/jinxianwei/CloudImg/assets/81373517/00878756-df1f-4e64-a04b-213371fda10b)
-Visualization of the classes on the first two principal components of the test-set features
 ![x_test_pca](https://github.com/jinxianwei/CloudImg/assets/81373517/d14fa1de-e5bf-46f2-8707-91d86bb2be21)
 #### Deep Learning
 Depends on **Pytorch**, framework **Pytorch_Lightning**

From cd720521fb8eefc381ddb1856230d4109bdf6264 Mon Sep 17 00:00:00 2001
From: jinxianwie <2948950250@qq.com>
Date: Tue, 3 Oct 2023 20:41:16 +0800
Subject: [PATCH 4/8] [Fix] image ax name

---
 README.md                   | 2 +-
 machine_learning/svm/svm.py | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 33f8ebc..629f665 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@
 ![k_neighbors](https://github.com/jinxianwei/CloudImg/assets/81373517/4b25b680-c883-48e2-9846-357959fe7363)
 - [x] SVM (classification task)
 Visualization of the decision boundaries of different SVM classifiers on the test set
-![svm](https://github.com/jinxianwei/CloudImg/assets/81373517/36703295-9af3-406e-b8bb-728c77852bb8)
+![svm](https://github.com/jinxianwei/CloudImg/assets/81373517/2a154234-ba2a-45d8-88ef-0ea4bd59cabf)
 - [x] LogisticRegression with PCA
 Visualization of the classes on the first two principal components of the training-set features
 ![x_train_pca](https://github.com/jinxianwei/CloudImg/assets/81373517/00878756-df1f-4e64-a04b-213371fda10b)

diff --git a/machine_learning/svm/svm.py b/machine_learning/svm/svm.py
index a426cae..ee098cd 100644
--- a/machine_learning/svm/svm.py
+++ b/machine_learning/svm/svm.py
@@ -23,6 +23,7 @@ def main():
     # The three class labels of the iris dataset
     label_dict = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
     label_list = ['setosa', 'versicolor', 'virginica']
+    feature_name = [val for key, val in name_dict.items()]
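+    # name_dict comes from the csv header, so feature_name should hold the
+    # feature (column) names; label_list holds class names, which the axis
+    # labels below used by mistake before this fix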
 
     # 2. Separate features and labels
     x = data[:, :2]  # use only the first two features for training, to ease visualization
@@ -75,8 +76,8 @@ def main():
         cmap=plt.cm.coolwarm,
         alpha=0.8,
         ax=ax,
-        xlabel=label_list[0],
-        ylabel=label_list[1],
+        xlabel=feature_name[0],
+        ylabel=feature_name[1],
     )
     ax.scatter(x_test[:, 0], x_test[:, 1], c=y_test, cmap=plt.cm.coolwarm, s=20, edgecolors="k")
     ax.set_xticks(())

From 613e630048af9ca2bdc51cf89c8bdf07e74bfedf Mon Sep 17 00:00:00 2001
From: jinxianwie <2948950250@qq.com>
Date: Tue, 3 Oct 2023 21:25:25 +0800
Subject: [PATCH 5/8] [Feature] Support drawing probability

---
 README.md                                     |   4 +
 .../plt_probability.py                        | 101 ++++++++++++++++++
 2 files changed, 105 insertions(+)
 create mode 100644 machine_learning/plot_classification_probability/plt_probability.py

diff --git a/README.md b/README.md
index 677787e..a01df14 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,9 @@
 Visualization of the classes on the first two principal components of the training-set and test-set features
 ![x_train_pca](https://github.com/jinxianwei/CloudImg/assets/81373517/00878756-df1f-4e64-a04b-213371fda10b)
 ![x_test_pca](https://github.com/jinxianwei/CloudImg/assets/81373517/d14fa1de-e5bf-46f2-8707-91d86bb2be21)
+- [x] Plot the prediction probabilities of different classifiers on the test set
+![prob](https://github.com/jinxianwei/CloudImg/assets/81373517/b498966e-64c9-4c3f-88db-8ff114d29ec8)
+
 #### Deep Learning
 Depends on **Pytorch**, framework **Pytorch_Lightning**
 - [x] Regression task (npv concrete strength data)
@@ -47,6 +50,7 @@ python machine_learning/logistic_regression/train.py
 
 # Adaboost (classification task)
 python machine_learning/adaboost/adaboost_classifier.py
+...
 
 # Deep regression
 python deep_learning/regression/train.py

diff --git a/machine_learning/plot_classification_probability/plt_probability.py b/machine_learning/plot_classification_probability/plt_probability.py
new file mode 100644
index 0000000..fa54dad
--- /dev/null
+++ b/machine_learning/plot_classification_probability/plt_probability.py
@@ -0,0 +1,101 @@
+import os
+import sys
+current_path = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.insert(0, current_path)
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+from sklearn.gaussian_process import GaussianProcessClassifier
+from sklearn.gaussian_process.kernels import RBF
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split
+from sklearn.svm import SVC
+
+from utils.read_csv_data import read_csv_data
+
+def main():
+    # 1. Read the csv data
+    name_dict, data = read_csv_data("dataset/iris.csv")
+    # The three class labels of the iris dataset
+    label_dict = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}
+    label_list = ['setosa', 'versicolor', 'virginica']
+    feature_name = [val for key, val in name_dict.items()]
+
+    # 2. Separate features and labels
+    x = data[:, :2]  # use only the first two features for training, to ease visualization
+    y = data[:, -1]
+
+    # 3. Split into training and test sets
+    x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, test_size=0.5, random_state=0)
+
+    n_features = x.shape[1]
+
+    # 4. Build the models
+    C = 10
+    kernel = 1.0 * RBF([1.0, 1.0])  # for GPC
+    # Create classifiers of different kinds
+    classifiers = {
+        "L1 logistic": LogisticRegression(
+            C=C, penalty="l1", solver="saga", multi_class="multinomial", max_iter=10000
+        ),
+        "L2 logistic (Multinomial)": LogisticRegression(
+            C=C, penalty="l2", solver="saga", multi_class="multinomial", max_iter=10000
+        ),
+        "L2 logistic (OvR)": LogisticRegression(
+            C=C, penalty="l2", solver="saga", multi_class="ovr", max_iter=10000
+        ),
+        "Linear SVC": SVC(kernel="linear", C=C, probability=True, random_state=0),
+        "GPC": GaussianProcessClassifier(kernel),
+    }
+
+    n_classifiers = len(classifiers)
+
+    plt.figure(figsize=(3 * 2, n_classifiers * 2))
+    plt.subplots_adjust(bottom=0.2, top=0.95)
+
+    xx = np.linspace(3, 9, 100)  # roughly x[:, 0].min() ~ x[:, 0].max()
+    yy = np.linspace(1, 5, 100).T  # roughly x[:, 1].min() ~ x[:, 1].max(); to cover
+    # the sample space, the grid has to span the value range of each feature
+    xx, yy = np.meshgrid(xx, yy)
+    Xfull = np.c_[xx.ravel(), yy.ravel()]  # grid of sample-space points to predict on
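+    # The grid bounds 3~9 and 1~5 are hard-coded for the first two iris
+    # features; a data-driven alternative (an optional sketch; the imshow
+    # extent below would have to be kept consistent) would be:
+    #   xx = np.linspace(x[:, 0].min() - 0.5, x[:, 0].max() + 0.5, 100)
+    #   yy = np.linspace(x[:, 1].min() - 0.5, x[:, 1].max() + 0.5, 100)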
+
+    # 5. Train each model
+    for index, (name, classifier) in enumerate(classifiers.items()):
+        # Fit the model on the training set
+        classifier.fit(x_train, y_train)
+        # Predict on the test set
+        y_pred = classifier.predict(x_test)
+        accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
+        print("Accuracy (test) for %s: %0.1f%%" % (name, accuracy * 100))
+
+        # View probabilities:
+        # Predict class probabilities for every point of the grid
+        probas = classifier.predict_proba(Xfull)
+        n_classes = np.unique(y_pred).size
+        for k in range(n_classes):
+            plt.subplot(n_classifiers, n_classes, index * n_classes + k + 1)
+            plt.title("{}".format(label_dict[k]))
+            if k == 0:
+                plt.ylabel(name)
+            imshow_handle = plt.imshow(
+                probas[:, k].reshape((100, 100)), extent=(3, 9, 1, 5), origin="lower"
+            )
+            plt.xticks(())
+            plt.yticks(())
+            # Boolean mask of the test samples predicted as class k; samples with
+            # other predicted classes are ignored (check x_test[idx, 0].shape)
+            idx = y_pred == k
+            if idx.any():
+                # Scatter the test samples predicted as class k
+                plt.scatter(x_test[idx, 0], x_test[idx, 1], marker="o", c="w", edgecolor="k")
+            if k == 2:
+                plt.xlabel(feature_name[0])
+                plt.ylabel(feature_name[1])
+
+    ax = plt.axes([0.15, 0.04, 0.7, 0.05])
+    plt.title("Probability")
+    plt.colorbar(imshow_handle, cax=ax, orientation="horizontal")
+
+    plt.savefig('./prob.png')
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 9b7f6c9feb8ee49c64fc1b39cfe856f67952b73c Mon Sep 17 00:00:00 2001
From: jinxianwie <2948950250@qq.com>
Date: Wed, 4 Oct 2023 11:38:47 +0800
Subject: [PATCH 6/8] [Feature] Support outlier data detection

---
 README.md                  |  4 ++
 utils/outlier_detection.py | 94 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+)
 create mode 100644 utils/outlier_detection.py

diff --git a/README.md b/README.md
index a01df14..5fe27ce 100644
--- a/README.md
+++ b/README.md
@@ -29,6 +29,10 @@
 ![x_test_pca](https://github.com/jinxianwei/CloudImg/assets/81373517/d14fa1de-e5bf-46f2-8707-91d86bb2be21)
 - [x] Plot the prediction probabilities of different classifiers on the test set
 ![prob](https://github.com/jinxianwei/CloudImg/assets/81373517/b498966e-64c9-4c3f-88db-8ff114d29ec8)
+- [x] Outlier detection
+Visualization of the predicted outliers and the ground-truth outliers
+![outlier_detection_pred](https://github.com/jinxianwei/CloudImg/assets/81373517/0975ce3d-b0bc-41b3-ba28-9b9d7464fbe6)
+![outlier_detection](https://github.com/jinxianwei/CloudImg/assets/81373517/09efcfd2-866f-4f9d-b0db-f6988a7855e1)
 
 #### Deep Learning
 Depends on **Pytorch**, framework **Pytorch_Lightning**

diff --git a/utils/outlier_detection.py b/utils/outlier_detection.py
new file mode 100644
index 0000000..915aa59
--- /dev/null
+++ b/utils/outlier_detection.py
@@ -0,0 +1,94 @@
+import numpy as np
+from sklearn.neighbors import LocalOutlierFactor
+import matplotlib.pyplot as plt
+from matplotlib.legend_handler import HandlerPathCollection
+
+def update_legend_marker_size(handle, orig):
+    """Customize the size of the legend marker."""
+    handle.update_from(orig)
+    handle.set_sizes([20])
+
+# Outlier detection on synthetic data
+def main():
+    # 1. Construct the data
+    np.random.seed(42)
+    # Generate inlier data
+    X_inliers = 0.3 * np.random.randn(100, 2)
+    X_inliers = np.r_[X_inliers + 2, X_inliers - 2]
+    # Generate outlier data
+    X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
+    # Concatenate the data
+    X = np.r_[X_inliers, X_outliers]
+
+    # Define the ground-truth labels (1: inlier, -1: outlier)
+    n_outliers = len(X_outliers)
+    ground_truth = np.ones(len(X), dtype=int)
+    ground_truth[-n_outliers:] = -1
+
+
+    # 2. Build the outlier-detection model
+    clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
+    # Predict which points in the data are outliers (1: inlier, -1: outlier)
+    y_pred = clf.fit_predict(X)
+    # Indices and number of the predicted outliers
+    outlier_indices = np.where(y_pred == -1)
+    num_pred_outlier = outlier_indices[0].shape
+    print("The model predicts {} outliers in the data, at indices {}".format(num_pred_outlier, outlier_indices))
+    # Count the prediction errors
+    n_errors = (y_pred != ground_truth).sum()
+    # Outlier score of every data point (the lower, the more abnormal)
+    X_scores = clf.negative_outlier_factor_
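+    # Optional sanity check: the injected outliers occupy the last n_outliers
+    # rows of X, so their mean LOF score should be clearly lower (more negative)
+    # than that of the inliers
+    print("Mean LOF score, inliers: {:.2f}, outliers: {:.2f}".format(
+        X_scores[:-n_outliers].mean(), X_scores[-n_outliers:].mean()))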
+
+    # 3. Visualize the ground truth and the predictions
+    # 3.1 Mark inliers in red and outliers in blue, colored by ground truth
+    plt.scatter(X[:, 0], X[:, 1], c=ground_truth, cmap=plt.cm.coolwarm, s=10, label="Data points")
+    # plot circles with radius proportional to the outlier scores
+    radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
+    # Turn the outlier scores into circle radii drawn around the points
+    # (the larger the radius, the more likely the point is an outlier)
+    scatter = plt.scatter(
+        X[:, 0],
+        X[:, 1],
+        s=1000 * radius,
+        edgecolors="r",
+        facecolors="none",
+        label="Outlier scores",
+    )
+    plt.axis("tight")
+    plt.xlim((-5, 5))
+    plt.ylim((-5, 5))
+    plt.xlabel("prediction errors: %d" % (n_errors))
+    plt.legend(
+        handler_map={scatter: HandlerPathCollection(update_func=update_legend_marker_size)}
+    )
+    plt.title("Local Outlier Factor (LOF)")
+    plt.savefig('./outlier_detection.png')
+
+    plt.clf()  # clear the figure
+
+    # 3.2 Mark inliers in red and outliers in blue, colored by the predictions (y_pred)
+    plt.scatter(X[:, 0], X[:, 1], c=y_pred, cmap=plt.cm.coolwarm, s=10, label="Data points")
+    # plot circles with radius proportional to the outlier scores
+    radius = (X_scores.max() - X_scores) / (X_scores.max() - X_scores.min())
+    # Turn the outlier scores into circle radii drawn around the points
+    # (the larger the radius, the more likely the point is an outlier)
+    scatter = plt.scatter(
+        X[:, 0],
+        X[:, 1],
+        s=1000 * radius,
+        edgecolors="r",
+        facecolors="none",
+        label="Outlier scores",
+    )
+    plt.axis("tight")
+    plt.xlim((-5, 5))
+    plt.ylim((-5, 5))
+    plt.xlabel("prediction errors: %d" % (n_errors))
+    plt.legend(
+        handler_map={scatter: HandlerPathCollection(update_func=update_legend_marker_size)}
+    )
+    plt.title("Local Outlier Factor (LOF)")
+    plt.savefig('./outlier_detection_pred.png')
+
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From af4fed9eaedeab18aa37e8e16fa0a6da8a29adf9 Mon Sep 17 00:00:00 2001
From: jinxianwie <2948950250@qq.com>
Date: Wed, 4 Oct 2023 11:39:29 +0800
Subject: [PATCH 7/8] [WIP] iris data outlier detection

---
 utils/outlier_detection_iris.py | 37 +++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 utils/outlier_detection_iris.py

diff --git a/utils/outlier_detection_iris.py b/utils/outlier_detection_iris.py
new file mode 100644
index 0000000..5ac573a
--- /dev/null
+++ b/utils/outlier_detection_iris.py
@@ -0,0 +1,37 @@
+import numpy as np
+from sklearn.neighbors import LocalOutlierFactor
+from read_csv_data import read_csv_data
+from sklearn.model_selection import GridSearchCV
+
+def main():
+    # 1. Read the csv data
+    name_dict, data = read_csv_data("dataset/iris.csv")
+    # 2. Build the outlier-detection model; its parameters need some tuning
+    clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
+    # Predict which points in the data are outliers (1: inlier, -1: outlier)
+    y_pred = clf.fit_predict(data)
+    # Indices and number of the predicted outliers
+    outlier_indices = np.where(y_pred == -1)
+    num_pred_outlier = outlier_indices[0].shape
+    print("The model predicts {} outliers in the data, at indices {}".format(num_pred_outlier, outlier_indices))
+
+    # TODO 3. Find the most suitable model parameters
+    # Define the parameter grid
+    param_grid = {'n_neighbors': [5, 10, 15, 20],
+                  'contamination': [0.05, 0.1, 0.15, 0.2]}
+    # Create the LOF model
+    lof = LocalOutlierFactor()
+    # Use GridSearchCV to search for the best parameters
+    grid_search = GridSearchCV(lof, param_grid=param_grid, scoring='neg_mean_squared_error', cv=5)
+    grid_search.fit(data)
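+    # NOTE: this grid search will not run as written: LocalOutlierFactor has no
+    # predict method in its default (unsupervised) mode, and the
+    # neg_mean_squared_error scorer needs ground-truth labels, which fit() is
+    # not given here; hence the WIP status of this script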
+
+    # Print the best parameters
+    print("Best parameters:", grid_search.best_params_)
+
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 86b0a07ae6382dd0382d3412462c1abd19074558 Mon Sep 17 00:00:00 2001
From: jinxianwie <2948950250@qq.com>
Date: Wed, 4 Oct 2023 18:39:16 +0800
Subject: [PATCH 8/8] update

---
 utils/outlier_detection_iris.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/outlier_detection_iris.py b/utils/outlier_detection_iris.py
index 5ac573a..7e9a225 100644
--- a/utils/outlier_detection_iris.py
+++ b/utils/outlier_detection_iris.py
@@ -15,7 +15,7 @@ def main():
     num_pred_outlier = outlier_indices[0].shape
     print("The model predicts {} outliers in the data, at indices {}".format(num_pred_outlier, outlier_indices))
 
-    # TODO 3. Find the most suitable model parameters
+    # TODO 3. Find reasonably suitable model parameters
     # Define the parameter grid
     param_grid = {'n_neighbors': [5, 10, 15, 20],
                   'contamination': [0.05, 0.1, 0.15, 0.2]}