Scikit Learn库的学习历程:分类预测
16 min read
Page Views
1.原始数据
这次使用的是鸢尾花数据集,前四列依次为:花萼长度、花萼宽度、花瓣长度、花瓣宽度,最后一列为鸢尾花的品种,0表示setosa,1表示versicolor,2表示virginica,具体数据如下所示:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
5.1 3.5 1.4 0.2 0
4.9 3 1.4 0.2 0
4.7 3.2 1.3 0.2 0
4.6 3.1 1.5 0.2 0
5 3.6 1.4 0.2 0
5.4 3.9 1.7 0.4 0
4.6 3.4 1.4 0.3 0
5 3.4 1.5 0.2 0
4.4 2.9 1.4 0.2 0
4.9 3.1 1.5 0.1 0
5.4 3.7 1.5 0.2 0
4.8 3.4 1.6 0.2 0
4.8 3 1.4 0.1 0
4.3 3 1.1 0.1 0
5.8 4 1.2 0.2 0
5.7 4.4 1.5 0.4 0
5.4 3.9 1.3 0.4 0
5.1 3.5 1.4 0.3 0
5.7 3.8 1.7 0.3 0
5.1 3.8 1.5 0.3 0
5.4 3.4 1.7 0.2 0
5.1 3.7 1.5 0.4 0
4.6 3.6 1 0.2 0
5.1 3.3 1.7 0.5 0
4.8 3.4 1.9 0.2 0
5 3 1.6 0.2 0
5 3.4 1.6 0.4 0
5.2 3.5 1.5 0.2 0
5.2 3.4 1.4 0.2 0
4.7 3.2 1.6 0.2 0
4.8 3.1 1.6 0.2 0
5.4 3.4 1.5 0.4 0
5.2 4.1 1.5 0.1 0
5.5 4.2 1.4 0.2 0
4.9 3.1 1.5 0.2 0
5 3.2 1.2 0.2 0
5.5 3.5 1.3 0.2 0
4.9 3.6 1.4 0.1 0
4.4 3 1.3 0.2 0
5.1 3.4 1.5 0.2 0
5 3.5 1.3 0.3 0
4.5 2.3 1.3 0.3 0
4.4 3.2 1.3 0.2 0
5 3.5 1.6 0.6 0
5.1 3.8 1.9 0.4 0
4.8 3 1.4 0.3 0
5.1 3.8 1.6 0.2 0
4.6 3.2 1.4 0.2 0
5.3 3.7 1.5 0.2 0
5 3.3 1.4 0.2 0
7 3.2 4.7 1.4 1
6.4 3.2 4.5 1.5 1
6.9 3.1 4.9 1.5 1
5.5 2.3 4 1.3 1
6.5 2.8 4.6 1.5 1
5.7 2.8 4.5 1.3 1
6.3 3.3 4.7 1.6 1
4.9 2.4 3.3 1 1
6.6 2.9 4.6 1.3 1
5.2 2.7 3.9 1.4 1
5 2 3.5 1 1
5.9 3 4.2 1.5 1
6 2.2 4 1 1
6.1 2.9 4.7 1.4 1
5.6 2.9 3.6 1.3 1
6.7 3.1 4.4 1.4 1
5.6 3 4.5 1.5 1
5.8 2.7 4.1 1 1
6.2 2.2 4.5 1.5 1
5.6 2.5 3.9 1.1 1
5.9 3.2 4.8 1.8 1
6.1 2.8 4 1.3 1
6.3 2.5 4.9 1.5 1
6.1 2.8 4.7 1.2 1
6.4 2.9 4.3 1.3 1
6.6 3 4.4 1.4 1
6.8 2.8 4.8 1.4 1
6.7 3 5 1.7 1
6 2.9 4.5 1.5 1
5.7 2.6 3.5 1 1
5.5 2.4 3.8 1.1 1
5.5 2.4 3.7 1 1
5.8 2.7 3.9 1.2 1
6 2.7 5.1 1.6 1
5.4 3 4.5 1.5 1
6 3.4 4.5 1.6 1
6.7 3.1 4.7 1.5 1
6.3 2.3 4.4 1.3 1
5.6 3 4.1 1.3 1
5.5 2.5 4 1.3 1
5.5 2.6 4.4 1.2 1
6.1 3 4.6 1.4 1
5.8 2.6 4 1.2 1
5 2.3 3.3 1 1
5.6 2.7 4.2 1.3 1
5.7 3 4.2 1.2 1
5.7 2.9 4.2 1.3 1
6.2 2.9 4.3 1.3 1
5.1 2.5 3 1.1 1
5.7 2.8 4.1 1.3 1
6.3 3.3 6 2.5 2
5.8 2.7 5.1 1.9 2
7.1 3 5.9 2.1 2
6.3 2.9 5.6 1.8 2
6.5 3 5.8 2.2 2
7.6 3 6.6 2.1 2
4.9 2.5 4.5 1.7 2
7.3 2.9 6.3 1.8 2
6.7 2.5 5.8 1.8 2
7.2 3.6 6.1 2.5 2
6.5 3.2 5.1 2 2
6.4 2.7 5.3 1.9 2
6.8 3 5.5 2.1 2
5.7 2.5 5 2 2
5.8 2.8 5.1 2.4 2
6.4 3.2 5.3 2.3 2
6.5 3 5.5 1.8 2
7.7 3.8 6.7 2.2 2
7.7 2.6 6.9 2.3 2
6 2.2 5 1.5 2
6.9 3.2 5.7 2.3 2
5.6 2.8 4.9 2 2
7.7 2.8 6.7 2 2
6.3 2.7 4.9 1.8 2
6.7 3.3 5.7 2.1 2
7.2 3.2 6 1.8 2
6.2 2.8 4.8 1.8 2
6.1 3 4.9 1.8 2
6.4 2.8 5.6 2.1 2
7.2 3 5.8 1.6 2
7.4 2.8 6.1 1.9 2
7.9 3.8 6.4 2 2
6.4 2.8 5.6 2.2 2
6.3 2.8 5.1 1.5 2
6.1 2.6 5.6 1.4 2
7.7 3 6.1 2.3 2
6.3 3.4 5.6 2.4 2
6.4 3.1 5.5 1.8 2
6 3 4.8 1.8 2
6.9 3.1 5.4 2.1 2
6.7 3.1 5.6 2.4 2
6.9 3.1 5.1 2.3 2
5.8 2.7 5.1 1.9 2
6.8 3.2 5.9 2.3 2
6.7 3.3 5.7 2.5 2
6.7 3 5.2 2.3 2
6.3 2.5 5 1.9 2
6.5 3 5.2 2 2
6.2 3.4 5.4 2.3 2
5.9 3 5.1 1.8 2
2.Python程序
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
# Print every column and keep wide frames on one line instead of wrapping.
pd.set_option('display.max_columns', None)
pd.set_option('expand_frame_repr', False)

# Build a DataFrame from the iris bunch: the feature columns first, then the
# integer class label in a trailing 'target' column.
iris_dataset = load_iris()
data = pd.DataFrame(iris_dataset.get('data'), columns=iris_dataset.get('feature_names'))
data['target'] = iris_dataset.get('target')
data.to_excel('iris dataset.xlsx', index=False)
print(data)

# Every column except the last is a feature; the last column is the label.
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the min/max used for preprocessing (data leakage), which
# makes the reported test metrics optimistic. The row partition is the same as
# splitting the scaled array, because it depends only on the number of samples
# and random_state.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=1)

# Learn the per-feature min/max from the training rows only, then apply that
# same mapping to the test rows (test values may fall slightly outside [0, 1]).
# NOTE: metrics recorded from runs that scaled before splitting may differ
# slightly from what this version prints.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
def evaluate_classifier(title, model, X_train, X_test, y_train, y_test, class_names):
    """Fit ``model`` on the training split and print its test-set metrics.

    Prints two things: a classification report under a heading built from
    ``title``, and the confusion matrix as a labelled DataFrame.

    Parameters
    ----------
    title : str
        Heading text printed above the classification report.
    model : estimator
        Unfitted classifier exposing ``fit`` and ``predict``.
    X_train, X_test
        Feature matrices used for fitting and for prediction.
    y_train, y_test
        Label vectors matching ``X_train`` and ``X_test``.
    class_names : sequence
        Labels used for both the row index and the columns of the printed
        confusion matrix.
    """
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    print(f'\n{title}:\n', classification_report(y_test, y_predict))
    matrix = pd.DataFrame(confusion_matrix(y_true=y_test, y_pred=y_predict),
                          index=class_names, columns=class_names)
    print('混淆矩阵:\n', matrix)


# (heading, estimator) pairs, evaluated in the order listed. The estimators are
# only constructed here; each one is trained inside evaluate_classifier.
classifiers = [
    # Ridge classifier
    ('岭分类', RidgeClassifier(alpha=1.0, fit_intercept=True, copy_X=True, max_iter=5000, random_state=1)),
    # Logistic regression
    ('逻辑回归分类', LogisticRegression(fit_intercept=True, max_iter=5000, random_state=1)),
    # Linear support vector machine
    ('支持向量机分类', LinearSVC(dual='auto', fit_intercept=True, max_iter=5000, random_state=1)),
    # Stochastic gradient descent (hinge loss)
    ('随机梯度下降分类', SGDClassifier(loss='hinge', fit_intercept=True, max_iter=5000, random_state=1)),
    # Gradient boosting
    ('梯度增强机分类', GradientBoostingClassifier(loss='log_loss', random_state=1)),
    # LightGBM (verbose=-1 as in the original call)
    ('轻度增强机分类', LGBMClassifier(random_state=1, verbose=-1)),
    # k-nearest neighbours
    ('最近邻分类', KNeighborsClassifier(algorithm='auto')),
    # Gaussian process
    ('高斯过程分类', GaussianProcessClassifier(random_state=1)),
    # Gaussian naive Bayes
    ('高斯朴素贝叶斯分类', GaussianNB()),
    # Decision tree
    ('决策树分类', DecisionTreeClassifier(random_state=1)),
    # Extra trees
    ('额外树分类', ExtraTreesClassifier(random_state=1)),
    # Random forest
    ('随机森林分类', RandomForestClassifier(random_state=1)),
    # Multi-layer perceptron
    ('神经网络分类', MLPClassifier(max_iter=5000, random_state=1)),
    # AdaBoost
    ('自适应增强分类', AdaBoostClassifier(random_state=1)),
    # XGBoost
    ('极限梯度增强分类', XGBClassifier(random_state=1)),
]

# The class names label both axes of every confusion matrix.
target_names = iris_dataset.get('target_names')
for title, model in classifiers:
    evaluate_classifier(title, model, X_train, X_test, y_train, y_test, target_names)
3.输出结果
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) target
0 5.1 3.5 1.4 0.2 0
1 4.9 3.0 1.4 0.2 0
2 4.7 3.2 1.3 0.2 0
3 4.6 3.1 1.5 0.2 0
4 5.0 3.6 1.4 0.2 0
.. ... ... ... ... ...
145 6.7 3.0 5.2 2.3 2
146 6.3 2.5 5.0 1.9 2
147 6.5 3.0 5.2 2.0 2
148 6.2 3.4 5.4 2.3 2
149 5.9 3.0 5.1 1.8 2
[150 rows x 5 columns]
岭分类:
precision recall f1-score support
0 1.00 1.00 1.00 14
1 0.90 0.50 0.64 18
2 0.57 0.92 0.71 13
accuracy 0.78 45
macro avg 0.82 0.81 0.78 45
weighted avg 0.84 0.78 0.77 45
混淆矩阵:
setosa versicolor virginica
setosa 14 0 0
versicolor 0 9 9
virginica 0 1 12
逻辑回归分类:
precision recall f1-score support
0 1.00 1.00 1.00 14
1 0.94 0.83 0.88 18
2 0.80 0.92 0.86 13
accuracy 0.91 45
macro avg 0.91 0.92 0.91 45
weighted avg 0.92 0.91 0.91 45
混淆矩阵:
setosa versicolor virginica
setosa 14 0 0
versicolor 0 15 3
virginica 0 1 12
支持向量机分类:
precision recall f1-score support
0 1.00 1.00 1.00 14
1 0.92 0.67 0.77 18
2 0.67 0.92 0.77 13
accuracy 0.84 45
macro avg 0.86 0.86 0.85 45
weighted avg 0.87 0.84 0.84 45
混淆矩阵:
setosa versicolor virginica
setosa 14 0 0
versicolor 0 12 6
virginica 0 1 12
随机梯度下降分类:
precision recall f1-score support
0 1.00 1.00 1.00 14
1 0.94 0.89 0.91 18
2 0.86 0.92 0.89 13
accuracy 0.93 45
macro avg 0.93 0.94 0.93 45
weighted avg 0.94 0.93 0.93 45
混淆矩阵:
setosa versicolor virginica
setosa 14 0 0
versicolor 0 16 2
virginica 0 1 12
梯度增强机分类:
precision recall f1-score support
0 1.00 1.00 1.00 14
1 0.94 0.94 0.94 18
2 0.92 0.92 0.92 13
accuracy 0.96 45
macro avg 0.96 0.96 0.96 45
weighted avg 0.96 0.96 0.96 45
混淆矩阵:
setosa versicolor virginica
setosa 14 0 0
versicolor 0 17 1
virginica 0 1 12
轻度增强机分类:
precision recall f1-score support
0 1.00 1.00 1.00 14
1 0.94 0.94 0.94 18
2 0.92 0.92 0.92 13
accuracy 0.96 45
macro avg 0.96 0.96 0.96 45
weighted avg 0.96 0.96 0.96 45
混淆矩阵:
setosa versicolor virginica
setosa 14 0 0
versicolor 0 17 1
virginica 0 1 12
最近邻分类:
precision recall f1-score support
0 1.00 1.00 1.00 14
1 0.94 0.94 0.94 18
2 0.92 0.92 0.92 13
accuracy 0.96 45
macro avg 0.96 0.96 0.96 45
weighted avg 0.96 0.96 0.96 45
混淆矩阵:
setosa versicolor virginica
setosa 14 0 0
versicolor 0 17 1
virginica 0 1 12
高斯过程分类:
precision recall f1-score support
0 1.00 1.00 1.00 14
1 0.93 0.78 0.85 18
2 0.75 0.92 0.83 13
accuracy 0.89 45
macro avg 0.89 0.90 0.89 45
weighted avg 0.90 0.89 0.89 45
混淆矩阵:
setosa versicolor virginica
setosa 14 0 0
versicolor 0 14 4
virginica 0 1 12
高斯朴素贝叶斯分类:
precision recall f1-score support
0 1.00 1.00 1.00 14
1 0.94 0.89 0.91 18
2 0.86 0.92 0.89 13
accuracy 0.93 45
macro avg 0.93 0.94 0.93 45
weighted avg 0.94 0.93 0.93 45
混淆矩阵:
setosa versicolor virginica
setosa 14 0 0
versicolor 0 16 2
virginica 0 1 12
决策树分类:
precision recall f1-score support
0 1.00 1.00 1.00 14
1 0.94 0.94 0.94 18
2 0.92 0.92 0.92 13
accuracy 0.96 45
macro avg 0.96 0.96 0.96 45
weighted avg 0.96 0.96 0.96 45
混淆矩阵:
setosa versicolor virginica
setosa 14 0 0
versicolor 0 17 1
virginica 0 1 12
额外树分类:
precision recall f1-score support
0 1.00 1.00 1.00 14
1 0.94 0.94 0.94 18
2 0.92 0.92 0.92 13
accuracy 0.96 45
macro avg 0.96 0.96 0.96 45
weighted avg 0.96 0.96 0.96 45
混淆矩阵:
setosa versicolor virginica
setosa 14 0 0
versicolor 0 17 1
virginica 0 1 12
随机森林分类:
precision recall f1-score support
0 1.00 1.00 1.00 14
1 0.94 0.94 0.94 18
2 0.92 0.92 0.92 13
accuracy 0.96 45
macro avg 0.96 0.96 0.96 45
weighted avg 0.96 0.96 0.96 45
混淆矩阵:
setosa versicolor virginica
setosa 14 0 0
versicolor 0 17 1
virginica 0 1 12
神经网络分类:
precision recall f1-score support
0 1.00 1.00 1.00 14
1 1.00 0.94 0.97 18
2 0.93 1.00 0.96 13
accuracy 0.98 45
macro avg 0.98 0.98 0.98 45
weighted avg 0.98 0.98 0.98 45
混淆矩阵:
setosa versicolor virginica
setosa 14 0 0
versicolor 0 17 1
virginica 0 0 13
自适应增强分类:
precision recall f1-score support
0 1.00 1.00 1.00 14
1 0.94 0.94 0.94 18
2 0.92 0.92 0.92 13
accuracy 0.96 45
macro avg 0.96 0.96 0.96 45
weighted avg 0.96 0.96 0.96 45
混淆矩阵:
setosa versicolor virginica
setosa 14 0 0
versicolor 0 17 1
virginica 0 1 12
极限梯度增强分类:
precision recall f1-score support
0 0.93 1.00 0.97 14
1 0.94 0.89 0.91 18
2 0.92 0.92 0.92 13
accuracy 0.93 45
macro avg 0.93 0.94 0.93 45
weighted avg 0.93 0.93 0.93 45
混淆矩阵:
setosa versicolor virginica
setosa 14 0 0
versicolor 1 16 1
virginica 0 1 12
Last updated on 2025-06-21