scikit-learn库的学习历程:分类预测

16 min read Page Views

1.原始数据

这次使用的是鸢尾花数据集,前四列依次为:花萼长度、花萼宽度、花瓣长度、花瓣宽度,最后一列为鸢尾花的品种,0表示setosa,1表示versicolor,2表示virginica,具体数据如下所示:

sepal length (cm)	sepal width (cm)	petal length (cm)	petal width (cm)	target
5.1	3.5	1.4	0.2	0
4.9	3	1.4	0.2	0
4.7	3.2	1.3	0.2	0
4.6	3.1	1.5	0.2	0
5	3.6	1.4	0.2	0
5.4	3.9	1.7	0.4	0
4.6	3.4	1.4	0.3	0
5	3.4	1.5	0.2	0
4.4	2.9	1.4	0.2	0
4.9	3.1	1.5	0.1	0
5.4	3.7	1.5	0.2	0
4.8	3.4	1.6	0.2	0
4.8	3	1.4	0.1	0
4.3	3	1.1	0.1	0
5.8	4	1.2	0.2	0
5.7	4.4	1.5	0.4	0
5.4	3.9	1.3	0.4	0
5.1	3.5	1.4	0.3	0
5.7	3.8	1.7	0.3	0
5.1	3.8	1.5	0.3	0
5.4	3.4	1.7	0.2	0
5.1	3.7	1.5	0.4	0
4.6	3.6	1	0.2	0
5.1	3.3	1.7	0.5	0
4.8	3.4	1.9	0.2	0
5	3	1.6	0.2	0
5	3.4	1.6	0.4	0
5.2	3.5	1.5	0.2	0
5.2	3.4	1.4	0.2	0
4.7	3.2	1.6	0.2	0
4.8	3.1	1.6	0.2	0
5.4	3.4	1.5	0.4	0
5.2	4.1	1.5	0.1	0
5.5	4.2	1.4	0.2	0
4.9	3.1	1.5	0.2	0
5	3.2	1.2	0.2	0
5.5	3.5	1.3	0.2	0
4.9	3.6	1.4	0.1	0
4.4	3	1.3	0.2	0
5.1	3.4	1.5	0.2	0
5	3.5	1.3	0.3	0
4.5	2.3	1.3	0.3	0
4.4	3.2	1.3	0.2	0
5	3.5	1.6	0.6	0
5.1	3.8	1.9	0.4	0
4.8	3	1.4	0.3	0
5.1	3.8	1.6	0.2	0
4.6	3.2	1.4	0.2	0
5.3	3.7	1.5	0.2	0
5	3.3	1.4	0.2	0
7	3.2	4.7	1.4	1
6.4	3.2	4.5	1.5	1
6.9	3.1	4.9	1.5	1
5.5	2.3	4	1.3	1
6.5	2.8	4.6	1.5	1
5.7	2.8	4.5	1.3	1
6.3	3.3	4.7	1.6	1
4.9	2.4	3.3	1	1
6.6	2.9	4.6	1.3	1
5.2	2.7	3.9	1.4	1
5	2	3.5	1	1
5.9	3	4.2	1.5	1
6	2.2	4	1	1
6.1	2.9	4.7	1.4	1
5.6	2.9	3.6	1.3	1
6.7	3.1	4.4	1.4	1
5.6	3	4.5	1.5	1
5.8	2.7	4.1	1	1
6.2	2.2	4.5	1.5	1
5.6	2.5	3.9	1.1	1
5.9	3.2	4.8	1.8	1
6.1	2.8	4	1.3	1
6.3	2.5	4.9	1.5	1
6.1	2.8	4.7	1.2	1
6.4	2.9	4.3	1.3	1
6.6	3	4.4	1.4	1
6.8	2.8	4.8	1.4	1
6.7	3	5	1.7	1
6	2.9	4.5	1.5	1
5.7	2.6	3.5	1	1
5.5	2.4	3.8	1.1	1
5.5	2.4	3.7	1	1
5.8	2.7	3.9	1.2	1
6	2.7	5.1	1.6	1
5.4	3	4.5	1.5	1
6	3.4	4.5	1.6	1
6.7	3.1	4.7	1.5	1
6.3	2.3	4.4	1.3	1
5.6	3	4.1	1.3	1
5.5	2.5	4	1.3	1
5.5	2.6	4.4	1.2	1
6.1	3	4.6	1.4	1
5.8	2.6	4	1.2	1
5	2.3	3.3	1	1
5.6	2.7	4.2	1.3	1
5.7	3	4.2	1.2	1
5.7	2.9	4.2	1.3	1
6.2	2.9	4.3	1.3	1
5.1	2.5	3	1.1	1
5.7	2.8	4.1	1.3	1
6.3	3.3	6	2.5	2
5.8	2.7	5.1	1.9	2
7.1	3	5.9	2.1	2
6.3	2.9	5.6	1.8	2
6.5	3	5.8	2.2	2
7.6	3	6.6	2.1	2
4.9	2.5	4.5	1.7	2
7.3	2.9	6.3	1.8	2
6.7	2.5	5.8	1.8	2
7.2	3.6	6.1	2.5	2
6.5	3.2	5.1	2	2
6.4	2.7	5.3	1.9	2
6.8	3	5.5	2.1	2
5.7	2.5	5	2	2
5.8	2.8	5.1	2.4	2
6.4	3.2	5.3	2.3	2
6.5	3	5.5	1.8	2
7.7	3.8	6.7	2.2	2
7.7	2.6	6.9	2.3	2
6	2.2	5	1.5	2
6.9	3.2	5.7	2.3	2
5.6	2.8	4.9	2	2
7.7	2.8	6.7	2	2
6.3	2.7	4.9	1.8	2
6.7	3.3	5.7	2.1	2
7.2	3.2	6	1.8	2
6.2	2.8	4.8	1.8	2
6.1	3	4.9	1.8	2
6.4	2.8	5.6	2.1	2
7.2	3	5.8	1.6	2
7.4	2.8	6.1	1.9	2
7.9	3.8	6.4	2	2
6.4	2.8	5.6	2.2	2
6.3	2.8	5.1	1.5	2
6.1	2.6	5.6	1.4	2
7.7	3	6.1	2.3	2
6.3	3.4	5.6	2.4	2
6.4	3.1	5.5	1.8	2
6	3	4.8	1.8	2
6.9	3.1	5.4	2.1	2
6.7	3.1	5.6	2.4	2
6.9	3.1	5.1	2.3	2
5.8	2.7	5.1	1.9	2
6.8	3.2	5.9	2.3	2
6.7	3.3	5.7	2.5	2
6.7	3	5.2	2.3	2
6.3	2.5	5	1.9	2
6.5	3	5.2	2	2
6.2	3.4	5.4	2.3	2
5.9	3	5.1	1.8	2

2.Python程序

import pandas as pd
from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

# Print every column and keep wide frames on a single row of text.
pd.set_option('display.max_columns', None)
pd.set_option('expand_frame_repr', False)

# Assemble the iris measurements and their class labels into one table,
# save a copy as a spreadsheet, and echo it to the console.
iris_dataset = load_iris()
data = pd.DataFrame(iris_dataset['data'], columns=iris_dataset['feature_names'])
data['target'] = iris_dataset['target']
data.to_excel('iris dataset.xlsx', index=False)
print(data)

# The label is the last column; every column before it is a feature.
y = data.iloc[:, -1]
X = data.iloc[:, :-1]

# Min-max scale the features.
# NOTE(review): the scaler is fitted on all rows *before* the split below, so
# statistics of the test rows leak into the scaling. Fitting on X_train only
# would be cleaner, but it would change the published results, so the
# original order is kept here.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Hold out 30% of the rows for testing; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=1)

def _print_report(name, y_true, y_pred, class_names):
    """Print the classification report and labelled confusion matrix for one model.

    Args:
        name: Heading printed above the classification report.
        y_true: True labels of the test split.
        y_pred: Labels predicted by the model for the test split.
        class_names: Names used for the rows and columns of the confusion matrix.
    """
    print(f'\n{name}:\n', classification_report(y_true, y_pred))
    print('混淆矩阵:\n',
          pd.DataFrame(confusion_matrix(y_true=y_true, y_pred=y_pred), index=class_names,
                       columns=class_names))


# (printed heading, unfitted estimator) pairs, evaluated in the order listed.
# A single table replaces fifteen copy-pasted fit/predict/print stanzas, so
# adding or removing a model is a one-line change.
classifiers = [
    # Ridge classifier
    ('岭分类', RidgeClassifier(alpha=1.0, fit_intercept=True, copy_X=True, max_iter=5000, random_state=1)),
    # Logistic regression
    ('逻辑回归分类', LogisticRegression(fit_intercept=True, max_iter=5000, random_state=1)),
    # Linear support vector machine
    ('支持向量机分类', LinearSVC(dual='auto', fit_intercept=True, max_iter=5000, random_state=1)),
    # Linear model trained with stochastic gradient descent (hinge loss)
    ('随机梯度下降分类', SGDClassifier(loss='hinge', fit_intercept=True, max_iter=5000, random_state=1)),
    # Gradient boosting
    ('梯度增强机分类', GradientBoostingClassifier(loss='log_loss', random_state=1)),
    # LightGBM (verbose=-1 as in the original call)
    ('轻度增强机分类', LGBMClassifier(random_state=1, verbose=-1)),
    # k-nearest neighbours
    ('最近邻分类', KNeighborsClassifier(algorithm='auto')),
    # Gaussian process
    ('高斯过程分类', GaussianProcessClassifier(random_state=1)),
    # Gaussian naive Bayes
    ('高斯朴素贝叶斯分类', GaussianNB()),
    # Decision tree
    ('决策树分类', DecisionTreeClassifier(random_state=1)),
    # Extra-trees ensemble
    ('额外树分类', ExtraTreesClassifier(random_state=1)),
    # Random forest
    ('随机森林分类', RandomForestClassifier(random_state=1)),
    # Multi-layer perceptron
    ('神经网络分类', MLPClassifier(max_iter=5000, random_state=1)),
    # AdaBoost
    ('自适应增强分类', AdaBoostClassifier(random_state=1)),
    # XGBoost
    ('极限梯度增强分类', XGBClassifier(random_state=1)),
]

# Fit each model on the training split, predict the test split, and report.
# `model` and `y_predict` stay module-level names, as in the original script.
for name, model in classifiers:
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)
    _print_report(name, y_test, y_predict, iris_dataset.get('target_names'))

3.输出结果

     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  target
0                  5.1               3.5                1.4               0.2       0
1                  4.9               3.0                1.4               0.2       0
2                  4.7               3.2                1.3               0.2       0
3                  4.6               3.1                1.5               0.2       0
4                  5.0               3.6                1.4               0.2       0
..                 ...               ...                ...               ...     ...
145                6.7               3.0                5.2               2.3       2
146                6.3               2.5                5.0               1.9       2
147                6.5               3.0                5.2               2.0       2
148                6.2               3.4                5.4               2.3       2
149                5.9               3.0                5.1               1.8       2

[150 rows x 5 columns]

岭分类
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.90      0.50      0.64        18
           2       0.57      0.92      0.71        13

    accuracy                           0.78        45
   macro avg       0.82      0.81      0.78        45
weighted avg       0.84      0.78      0.77        45

混淆矩阵
             setosa  versicolor  virginica
setosa          14           0          0
versicolor       0           9          9
virginica        0           1         12

逻辑回归分类
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.94      0.83      0.88        18
           2       0.80      0.92      0.86        13

    accuracy                           0.91        45
   macro avg       0.91      0.92      0.91        45
weighted avg       0.92      0.91      0.91        45

混淆矩阵
             setosa  versicolor  virginica
setosa          14           0          0
versicolor       0          15          3
virginica        0           1         12

支持向量机分类
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.92      0.67      0.77        18
           2       0.67      0.92      0.77        13

    accuracy                           0.84        45
   macro avg       0.86      0.86      0.85        45
weighted avg       0.87      0.84      0.84        45

混淆矩阵
             setosa  versicolor  virginica
setosa          14           0          0
versicolor       0          12          6
virginica        0           1         12

随机梯度下降分类
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.94      0.89      0.91        18
           2       0.86      0.92      0.89        13

    accuracy                           0.93        45
   macro avg       0.93      0.94      0.93        45
weighted avg       0.94      0.93      0.93        45

混淆矩阵
             setosa  versicolor  virginica
setosa          14           0          0
versicolor       0          16          2
virginica        0           1         12

梯度增强机分类
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.94      0.94      0.94        18
           2       0.92      0.92      0.92        13

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45

混淆矩阵
             setosa  versicolor  virginica
setosa          14           0          0
versicolor       0          17          1
virginica        0           1         12

轻度增强机分类
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.94      0.94      0.94        18
           2       0.92      0.92      0.92        13

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45

混淆矩阵
             setosa  versicolor  virginica
setosa          14           0          0
versicolor       0          17          1
virginica        0           1         12

最近邻分类
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.94      0.94      0.94        18
           2       0.92      0.92      0.92        13

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45

混淆矩阵
             setosa  versicolor  virginica
setosa          14           0          0
versicolor       0          17          1
virginica        0           1         12

高斯过程分类
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.93      0.78      0.85        18
           2       0.75      0.92      0.83        13

    accuracy                           0.89        45
   macro avg       0.89      0.90      0.89        45
weighted avg       0.90      0.89      0.89        45

混淆矩阵
             setosa  versicolor  virginica
setosa          14           0          0
versicolor       0          14          4
virginica        0           1         12

高斯朴素贝叶斯分类
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.94      0.89      0.91        18
           2       0.86      0.92      0.89        13

    accuracy                           0.93        45
   macro avg       0.93      0.94      0.93        45
weighted avg       0.94      0.93      0.93        45

混淆矩阵
             setosa  versicolor  virginica
setosa          14           0          0
versicolor       0          16          2
virginica        0           1         12

决策树分类
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.94      0.94      0.94        18
           2       0.92      0.92      0.92        13

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45

混淆矩阵
             setosa  versicolor  virginica
setosa          14           0          0
versicolor       0          17          1
virginica        0           1         12

额外树分类
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.94      0.94      0.94        18
           2       0.92      0.92      0.92        13

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45

混淆矩阵
             setosa  versicolor  virginica
setosa          14           0          0
versicolor       0          17          1
virginica        0           1         12

随机森林分类
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.94      0.94      0.94        18
           2       0.92      0.92      0.92        13

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45

混淆矩阵
             setosa  versicolor  virginica
setosa          14           0          0
versicolor       0          17          1
virginica        0           1         12

神经网络分类
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      0.94      0.97        18
           2       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.98      0.98        45
weighted avg       0.98      0.98      0.98        45

混淆矩阵
             setosa  versicolor  virginica
setosa          14           0          0
versicolor       0          17          1
virginica        0           0         13

自适应增强分类
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.94      0.94      0.94        18
           2       0.92      0.92      0.92        13

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45

混淆矩阵
             setosa  versicolor  virginica
setosa          14           0          0
versicolor       0          17          1
virginica        0           1         12

极限梯度增强分类
               precision    recall  f1-score   support

           0       0.93      1.00      0.97        14
           1       0.94      0.89      0.91        18
           2       0.92      0.92      0.92        13

    accuracy                           0.93        45
   macro avg       0.93      0.94      0.93        45
weighted avg       0.93      0.93      0.93        45

混淆矩阵
             setosa  versicolor  virginica
setosa          14           0          0
versicolor       1          16          1
virginica        0           1         12
Last updated on 2025-06-21