"""Compare a random-forest baseline against three feature-selection filters.

The script loads ``heart.csv``, splits it 80/20 into train and test sets, and
then trains a ``RandomForestClassifier`` four times:

1. on all features (baseline),
2. on features kept by ``VarianceThreshold``,
3. on the ``k`` best features according to ``SelectKBest(f_classif)``,
4. on features kept by ``SelectFromModel(Lasso)``.

For every run it prints the elapsed wall-clock time, the classification
report and the confusion matrix on the test set.
"""

# First run the previously prepared preprocessing code.
import time
import warnings

import matplotlib.pyplot as plt  # plotting library
import numpy as np  # numerical arrays (not used in this section)
import pandas as pd  # tabular data handling
import seaborn as sns  # statistical plots on top of matplotlib (not used in this section)
from sklearn.ensemble import RandomForestClassifier  # random-forest classifier
from sklearn.feature_selection import (
    SelectFromModel,
    SelectKBest,
    VarianceThreshold,
    f_classif,
)
from sklearn.linear_model import Lasso
from sklearn.metrics import (  # classifier evaluation metrics
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split

# Silence all warnings for the rest of the script.
warnings.filterwarnings("ignore")

# Configure a CJK-capable font so Chinese labels render in plots.
plt.rcParams["font.sans-serif"] = ["SimHei"]  # SimHei is commonly available on Windows
plt.rcParams["axes.unicode_minus"] = False  # render the minus sign correctly

# Location of the input data set; edit this when running on another machine.
DATA_PATH = r"E:\02_Data\06_Python Program\python60-days-challenge-master\python60-days-challenge-master\heart.csv"

data = pd.read_csv(DATA_PATH)  # load the data

X = data.drop(["target"], axis=1)  # features; axis=1 drops a column
y = data["target"]  # labels

# 80% training set, 20% test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
data.head()  # notebook-style preview; has no visible effect when run as a script

# --- Baseline: random forest on all features -------------------------------
start_time = time.time()
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
end_time = time.time()
print(f"耗费时间{end_time-start_time}")
print(f"随机森林在测试集上的分类报告")
print(classification_report(y_test, rf_pred))
print("随机森林在测试集上的混淆矩阵")
print(confusion_matrix(y_test, rf_pred))

# --- Variance-threshold filter ---------------------------------------------
print("方差筛选")
start_time = time.time()
selector = VarianceThreshold(threshold=0.01)
X_train_var = selector.fit_transform(X_train)
X_test_var = selector.transform(X_test)
selected_features_var = X_train.columns[selector.get_support()].tolist()
print(f"筛选后特征数量{len(selected_features_var)}")
print(f"筛选后的特征{selected_features_var}")
rf_model_var = RandomForestClassifier(random_state=42)
rf_model_var.fit(X_train_var, y_train)
# Predict on the filtered test matrix so the columns match what the model
# was trained on (the original predicted on the unfiltered X_test).
rf_pred_var = rf_model_var.predict(X_test_var)
end_time = time.time()
print(f"训练耗时{end_time-start_time}")
print(f"训练报告{classification_report(y_test,rf_pred_var)}")
print(f"在测试集上的混淆矩阵{confusion_matrix(y_test,rf_pred_var)}")

# --- Univariate filter (SelectKBest) ---------------------------------------
# NOTE(review): the printed labels call this a Pearson-correlation filter, but
# the score function actually used is f_classif; confirm which was intended.
print("皮尔逊相关系数筛选")
start_time = time.time()
k = 10
selector = SelectKBest(score_func=f_classif, k=k)
X_train_corr = selector.fit_transform(X_train, y_train)
X_test_corr = selector.transform(X_test)
selected_features_corr = X_train.columns[selector.get_support()].tolist()
print(f"皮尔逊相关系数筛选后特征数量{len(selected_features_corr)}")
print(f"皮尔逊相关系数筛选后特征{selected_features_corr}")
rf_model_corr = RandomForestClassifier(random_state=42)
rf_model_corr.fit(X_train_corr, y_train)
rf_pred_corr = rf_model_corr.predict(X_test_corr)
end_time = time.time()
print(f"耗费时间{end_time-start_time}")
print(f"预测报告{classification_report(y_test,rf_pred_corr)}")
print(f"混淆矩阵")
print(confusion_matrix(y_test, rf_pred_corr))

# --- Lasso-based filter ----------------------------------------------------
print("---lasso筛选---")
start_time = time.time()
lasso = Lasso(alpha=0.01, random_state=42)  # very light regularization
# Bind the Lasso selector to `selector`; the original assigned it to a
# misspelled name, so the SelectKBest selector above was silently reused.
selector = SelectFromModel(lasso)
# fit_transform fits the selector itself, so no separate fit() call is needed.
X_train_lasso = selector.fit_transform(X_train, y_train)
X_test_lasso = selector.transform(X_test)
selected_features_lasso = X_train.columns[selector.get_support()].tolist()
print(f"筛选后特征个数{len(selected_features_lasso)}")
print(f"筛选后特征{selected_features_lasso}")
rf_model_lasso = RandomForestClassifier(random_state=42)
rf_model_lasso.fit(X_train_lasso, y_train)
rf_pred_lasso = rf_model_lasso.predict(X_test_lasso)
end_time = time.time()
print(f"训练时间{end_time-start_time}")
print(f"训练报告{classification_report(y_test,rf_pred_lasso)}")
print(f"混淆矩阵")
print(confusion_matrix(y_test, rf_pred_lasso))

# Trailing tag carried over from the original file: 浙大疏锦行