# The following is all boilerplate; just copy it as-is
import pandas as pd
import numpy as np
# Import the matplotlib.pyplot plotting library; plt.plot() is one of its most commonly used functions
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's plotting style by default
sns.set_theme()
"font.sans-serif"]=["SimHei"] #设置字体。如果不设置,中文会乱码。这里采用黑体SimHei,也可以考虑其他字体,如宋体SimSun等
plt.rcParams["axes.unicode_minus"]=False #该语句解决图像中的“-”负号的乱码问题
plt.rcParams[
# Use the 'svg' backend for figures: svg is a vector format that stays sharp at any zoom level and displays well on all kinds of screens
%config InlineBackend.figure_formats = ['svg']
23 Machine Learning Classification Algorithms
23.1 Unsupervised Learning
23.1.1 k-means
23.2
np.random.seed(1000)
np.set_printoptions(suppress=True, precision=4)
from sklearn.datasets import make_blobs
X, y = make_blobs(n_samples=250, centers=4,
                  random_state=500, cluster_std=1.25)
plt.figure(figsize=(8, 5))
plt.scatter(X[:, 0], X[:, 1], s=20);
from sklearn.cluster import KMeans
model = KMeans(n_clusters=4, random_state=0)
model.fit(X)
y_kmeans = model.predict(X)
plt.figure(figsize=(8, 5))
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, cmap='coolwarm');
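Beyond the labels, the fitted KMeans object exposes the learned centroids and the inertia (within-cluster sum of squared distances), which are useful for sanity-checking the clustering or choosing n_clusters. A minimal sketch, reusing the X and model objects from above (the printed values are not shown in the source):
print(model.cluster_centers_)  # coordinates of the 4 learned centroids
print(model.inertia_)          # within-cluster sum of squared distances

# A simple "elbow" scan over candidate cluster counts:
# inertia always decreases with k, but the drop flattens near the true k
for k in range(2, 8):
    km = KMeans(n_clusters=k, random_state=0).fit(X)
    print(k, km.inertia_)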
23.3 Supervised Learning
from sklearn.datasets import make_classification
n_samples = 100
X, y = make_classification(n_samples=n_samples, n_features=2,
                           n_informative=2, n_redundant=0,
                           n_repeated=0, random_state=250)
plt.figure(figsize=(8, 5))
plt.scatter(x=X[:, 0], y=X[:, 1], c=y, cmap='coolwarm');
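Before fitting anything, a quick check of the generated data's shape and class balance can help; a small sketch, assuming the X and y from the call above:
print(X.shape, y.shape)  # expected: (100, 2) (100,)
print(np.bincount(y))    # per-class sample counts; make_classification balances classes by default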
23.3.1 Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
model = GaussianNB()
model.fit(X, y)
GaussianNB()
model.predict_proba(X).round(4)[:5]
array([[0.0041, 0.9959],
       [0.8534, 0.1466],
       [0.9947, 0.0053],
       [0.0182, 0.9818],
       [0.5156, 0.4844]])
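For this binary problem, predict simply picks the more probable class in each row of predict_proba; equivalently, as a small sketch reusing the probabilities above:
proba = model.predict_proba(X)
(proba[:, 1] > 0.5).astype(int)[:5]  # should match model.predict(X)[:5]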
pred = model.predict(X)
print(pred[:5])
accuracy_score(y, pred)
[1 0 0 1 0]
0.87
Xc = X[y == pred]  # correctly classified samples
Xf = X[y != pred]  # misclassified samples
plt.figure(figsize=(8, 5))
plt.scatter(x=Xc[:, 0], y=Xc[:, 1], c=y[y == pred],
            marker='o', cmap='coolwarm')
plt.scatter(x=Xf[:, 0], y=Xf[:, 1], c=y[y != pred],
            marker='x', cmap='coolwarm');
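The fit can also be visualized as a decision boundary by evaluating the classifier on a grid; a minimal sketch, assuming the X, y, and fitted model from above (the grid variables xx, yy, Z are illustrative names):
# Evaluate the classifier over a mesh grid to shade its decision regions
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200),
                     np.linspace(y_min, y_max, 200))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.figure(figsize=(8, 5))
plt.contourf(xx, yy, Z, alpha=0.3, cmap='coolwarm')
plt.scatter(X[:, 0], X[:, 1], c=y, cmap='coolwarm');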
23.3.2 Logistic Regression
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C=1, solver='lbfgs')
model.fit(X, y)
pred = model.predict(X)
accuracy_score(y, pred)
0.9
Xc = X[y == pred]
Xf = X[y != pred]
plt.figure(figsize=(8, 5))
plt.scatter(x=Xc[:, 0], y=Xc[:, 1], c=y[y == pred],
            marker='o', cmap='coolwarm')
plt.scatter(x=Xf[:, 0], y=Xf[:, 1], c=y[y != pred],
            marker='x', cmap='coolwarm');
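Because logistic regression is a linear model, its coefficients and intercept define the separating line directly, and predict_proba yields class probabilities just as GaussianNB did; a short sketch, assuming the fitted model from above:
print(model.coef_, model.intercept_)        # parameters of the linear decision boundary
print(model.predict_proba(X)[:5].round(4))  # class probabilities for the first 5 samples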
23.3.3 Decision Tree
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=4)
model.fit(X, y)
pred = model.predict(X)
accuracy_score(y, pred)
0.97
Xc = X[y == pred]
Xf = X[y != pred]
plt.figure(figsize=(8, 5))
plt.scatter(x=Xc[:, 0], y=Xc[:, 1], c=y[y == pred],
            marker='o', cmap='coolwarm')
plt.scatter(x=Xf[:, 0], y=Xf[:, 1], c=y[y != pred],
            marker='x', cmap='coolwarm');
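The in-sample accuracy of 0.97 reflects max_depth=4: deeper trees fit the training data ever more closely, and an unconstrained tree can memorize it outright. A quick scan over depths, sketched here by reusing X and y from above (random_state=0 is an added assumption for reproducibility; in-sample accuracy generally rises toward 1.0):
for depth in range(1, 8):
    tree = DecisionTreeClassifier(max_depth=depth, random_state=0)
    tree.fit(X, y)
    print(depth, accuracy_score(y, tree.predict(X)))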
23.3.4 Deep Neural Network
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(solver='lbfgs', alpha=1e-5,
                      hidden_layer_sizes=2 * [75], random_state=10)
%time model.fit(X, y)
CPU times: user 1.49 s, sys: 38.3 ms, total: 1.52 s
Wall time: 195 ms
/home/lee/anaconda3/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:549: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
MLPClassifier(alpha=1e-05, hidden_layer_sizes=[75, 75], random_state=10,
solver='lbfgs')
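The ConvergenceWarning above recommends raising max_iter or scaling the data. One common remedy is to standardize the features inside a pipeline, sketched here with scikit-learn's StandardScaler (the scaled_mlp name and max_iter=1000 value are illustrative choices, not from the source):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

scaled_mlp = make_pipeline(
    StandardScaler(),  # rescale each feature to zero mean and unit variance
    MLPClassifier(solver='lbfgs', alpha=1e-5, max_iter=1000,
                  hidden_layer_sizes=2 * [75], random_state=10))
scaled_mlp.fit(X, y)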
pred = model.predict(X)
accuracy_score(y, pred)
1.0
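The perfect 1.0 here is in-sample accuracy: two hidden layers of 75 units have far more capacity than 100 points require, so the network can simply memorize the data. A held-out split gives a more honest estimate; a sketch using train_test_split (introduced formally in the next subsection), with illustrative variable names:
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.33, random_state=0)
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=2 * [75], random_state=10)
mlp.fit(X_tr, y_tr)
print(accuracy_score(y_tr, mlp.predict(X_tr)))  # training accuracy
print(accuracy_score(y_te, mlp.predict(X_te)))  # test accuracy, typically lower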
23.3.5 Support Vector Machine
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.33,
                                                    random_state=0)
model = SVC(C=1, kernel='linear')
model.fit(train_x, train_y)
SVC(C=1, kernel='linear')
pred_train = model.predict(train_x)
accuracy_score(train_y, pred_train)
0.9402985074626866
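The whole point of the split is to score the held-out portion as well; the test accuracy is the number that matters and will typically be somewhat lower than the training score. A sketch continuing directly from the code above (the test-set value is not shown in the source):
pred_test = model.predict(test_x)
accuracy_score(test_y, pred_test)  # out-of-sample accuracy, the more meaningful figure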