机器学习-分类问题

将特定类别出现的概率与该类别出现时每一个特征值出现的概率取乘积,以此表示该组特征值被归属为该类别的概率。以此计算该组特征值被归属为每一个类别的概率,择其最大的概率所对应的类别作为预测结果。

简单分类

1
2
3
4
5
6
7
8
9
10
x1 x2  ->  y
3 1 0
2 5 1
1 8 1
6 4 0
5 2 0
3 5 1
4 7 1
4 1 0
3 9 ? -> 1

$x_1 > x_2: y = 0$
$x_1 < x_2: y = 1$

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import numpy as np
import matplotlib.pyplot as mp
# Training samples: each row is one point (x1, x2).
samples = np.array([
    [3, 1],
    [2, 5],
    [1, 8],
    [6, 4],
    [5, 2],
    [3, 5],
    [4, 7],
    [4, -1],
])
# Class label of every sample, in the same order as the rows above.
labels = np.array([0, 1, 1, 0, 0, 1, 1, 0])

# Plot area: the data range padded by one unit on every side, sampled on a
# regular grid with a step of 0.005 along both axes.
step = 0.005
x1_axis = np.arange(samples[:, 0].min() - 1, samples[:, 0].max() + 1, step)
x2_axis = np.arange(samples[:, 1].min() - 1, samples[:, 1].max() + 1, step)
mesh_x1, mesh_x2 = np.meshgrid(x1_axis, x2_axis)

# Classification rule: a grid point is class 1 where x1 < x2, class 0
# everywhere else.
mesh_class = (mesh_x1 < mesh_x2).astype(int)

mp.figure('Simple Classification', facecolor='lightgray')
mp.title('Simple Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
# Background shows the class assigned to each grid point; the dots are the
# training samples coloured by their true label.
mp.pcolormesh(mesh_x1, mesh_x2, mesh_class, cmap='gray')
mp.scatter(samples[:, 0], samples[:, 1], c=labels, cmap='brg', s=80)
mp.show()

逻辑回归分类器

$y = w0 + w1x$

$z = \frac{1}{1 + e^{-y}}$

x样本被归属为1类别的概率

梯度下降
model = LogisticRegression(solver='liblinear', C=正则强度)
$w_0 + w_1 x_1 + w_2 x_2 = 0$

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
``` py
import numpy as np
import sklearn.linear_model as lm
import matplotlib.pyplot as mp
# Eight two-feature training samples with binary class labels.
features = np.array([
    [3, 1],
    [2, 5],
    [1, 8],
    [6, 4],
    [5, 2],
    [3, 5],
    [4, 7],
    [4, -1],
])
targets = np.array([0, 1, 1, 0, 0, 1, 1, 0])

# Logistic regression classifier
classifier = lm.LogisticRegression(solver='liblinear', C=1)
classifier.fit(features, targets)

# Sample the plot area (data range padded by one unit) on a 0.005 grid and
# let the trained classifier label every grid point.
step = 0.005
x1_axis = np.arange(features[:, 0].min() - 1, features[:, 0].max() + 1, step)
x2_axis = np.arange(features[:, 1].min() - 1, features[:, 1].max() + 1, step)
mesh_x1, mesh_x2 = np.meshgrid(x1_axis, x2_axis)
mesh_points = np.column_stack((mesh_x1.ravel(), mesh_x2.ravel()))
mesh_class = classifier.predict(mesh_points).reshape(mesh_x1.shape)

mp.figure('Logistic Classification', facecolor='lightgray')
mp.title('Logistic Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
# Background: predicted class per grid point; dots: training samples
# coloured by their true label.
mp.pcolormesh(mesh_x1, mesh_x2, mesh_class, cmap='gray')
mp.scatter(features[:, 0], features[:, 1], c=targets, cmap='brg', s=80)
mp.show()
1
2
3
4
5
6
7
xxxx A 1    0     0
xxxx B 0 1 0
xxxx C 0 0 1
A B C
........ 0.2 0.6 0.4 B
........ 0.6 0.1 0.3 A
........ 0.4 0.2 0.8 C
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import numpy as np
import sklearn.linear_model as lm
import matplotlib.pyplot as mp
# Nine two-feature training samples belonging to three classes (0, 1, 2).
features = np.array([
    [4, 7],
    [3.5, 8],
    [3.1, 6.2],
    [0.5, 1],
    [1, 2],
    [1.2, 1.9],
    [6, 2],
    [5.7, 1.5],
    [5.4, 2.2],
])
targets = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])

# Logistic regression classifier
classifier = lm.LogisticRegression(solver='liblinear', C=1000)
classifier.fit(features, targets)

# Sample the plot area (data range padded by one unit) on a 0.005 grid and
# let the trained classifier label every grid point.
step = 0.005
x1_axis = np.arange(features[:, 0].min() - 1, features[:, 0].max() + 1, step)
x2_axis = np.arange(features[:, 1].min() - 1, features[:, 1].max() + 1, step)
mesh_x1, mesh_x2 = np.meshgrid(x1_axis, x2_axis)
mesh_points = np.column_stack((mesh_x1.ravel(), mesh_x2.ravel()))
mesh_class = classifier.predict(mesh_points).reshape(mesh_x1.shape)

mp.figure('Logistic Classification', facecolor='lightgray')
mp.title('Logistic Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
# Background: predicted class per grid point; dots: training samples
# coloured by their true label.
mp.pcolormesh(mesh_x1, mesh_x2, mesh_class, cmap='gray')
mp.scatter(features[:, 0], features[:, 1], c=targets, cmap='brg', s=80)
mp.show()

朴素贝叶斯分类

1
2
3
4
5
6
7
8
9
10
11
12
13
1 2 3 -> 0
4 5 6 -> 1
7 8 9 -> 2
...
1 5 9 -> ? 0 0.7
1 0.5
2 0.8 *
P(x1,x2,x3,C)
=P(x1|x2,x3,C)P(x2,x3,C)
=P(x1|x2,x3,C)P(x2|x3,C)P(x3,C)
=P(x1|x2,x3,C)P(x2|x3,C)P(x3|C)P(C)
朴素:条件独立,特征值之间没有约束性。
=P(x1|C)P(x2|C)P(x3|C)P(C)

将特定类别出现的概率与该类别出现时每一个特征值出现的概率取乘积,以此表示该组特征值被归属为该类别的概率。以此计算该组特征值被归属为每一个类别的概率,择其最大的概率所对应的类别作为预测结果。
关于某个特征值在特定类别出现时的概率,可以通过事先已知的概率密度函数或概率质量函数计算得到。
体现历史数据所表现出的统计规律,同时不存在对分类边界的线性约束,但是对于统计规则不明且样本数量较少的场合不适用。
import sklearn.naive_bayes as nb
model = nb.GaussianNB()
基于高斯分布即正态分布的朴素贝叶斯分类器

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import numpy as np
import sklearn.naive_bayes as nb
import matplotlib.pyplot as mp
# Load the samples: every line of the file is a comma-separated record of
# numbers; the last field is the class label, the preceding fields are the
# features. ndmin=2 keeps the array two-dimensional even for a single record.
data = np.loadtxt('../data/multiple1.txt', delimiter=',', ndmin=2)
x, y = data[:, :-1], data[:, -1]
# Naive Bayes classifier (Gaussian likelihood per feature)
model = nb.GaussianNB()
model.fit(x, y)
# Sample the plot area (data range padded by one unit) on a 0.005 grid and
# let the trained model label every grid point.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)
mp.figure('NB Classification', facecolor='lightgray')
mp.title('NB Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
# Background: predicted class per grid point; dots: all samples coloured by
# their true label.
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(x[:, 0], x[:, 1], c=y, cmap='brg', s=80)
mp.show()

1. 划分训练集和测试集

import sklearn.model_selection as ms;
ms.train_test_split(
输入集, 输出集, test_size=测试集比例,
random_state=随机种子)->
训练输入集,测试输入集,训练输出集,测试输出集
该函数默认将全部样本随机洗牌,然后根据test_size参数提取相应比例的样本用于测试,其余样本用于训练。默认情况下并不保证各个类别在训练集和测试集中的分布比例一致;如需按类别分层抽样,使各类别比例在训练集和测试集中保持一致,应传入stratify=输出集。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import numpy as np
import sklearn.model_selection as ms
import sklearn.naive_bayes as nb
import matplotlib.pyplot as mp
# Load the samples: every line of the file is a comma-separated record of
# numbers; the last field is the class label, the preceding fields are the
# features. ndmin=2 keeps the array two-dimensional even for a single record.
data = np.loadtxt('../data/multiple1.txt', delimiter=',', ndmin=2)
x, y = data[:, :-1], data[:, -1]
# Split into training and test sets at a 3:1 ratio
train_x, test_x, train_y, test_y = ms.train_test_split(
    x, y, test_size=0.25, random_state=7)
# Naive Bayes classifier, trained on the training part only
model = nb.GaussianNB()
model.fit(train_x, train_y)
# Sample the plot area (full data range padded by one unit) on a 0.005 grid
# and let the trained model label every grid point.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)
# Fraction of test samples whose predicted label equals the true label.
pred_test_y = model.predict(test_x)
print((pred_test_y == test_y).sum() / pred_test_y.size)
mp.figure('NB Classification', facecolor='lightgray')
mp.title('NB Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
# Background: predicted class per grid point; dots: test samples coloured by
# their true label.
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(test_x[:, 0], test_x[:, 1], c=test_y, cmap='brg', s=80)
mp.show()

2. 评价分类器的性能

查准率:对于某个类别,找对的/找出来的,正确性。
召回率:对于某个类别,找对的/该类别样本数,完整性。
F1得分:2x查准率x召回率/(查准率+召回率)

3. 交叉验证

ms.cross_val_score(模型对象, 输入集, 输出集,
cv=验证次数, scoring=评价指标)->
每次验证测试集各个类别指标的平均值

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
    import numpy as np
import sklearn.model_selection as ms
import sklearn.naive_bayes as nb
import matplotlib.pyplot as mp
# Load the samples: every line of the file is a comma-separated record of
# numbers; the last field is the class label, the preceding fields are the
# features. ndmin=2 keeps the array two-dimensional even for a single record.
data = np.loadtxt('../data/multiple1.txt', delimiter=',', ndmin=2)
x, y = data[:, :-1], data[:, -1]
# Split into training and test sets at a 3:1 ratio
train_x, test_x, train_y, test_y = ms.train_test_split(
    x, y, test_size=0.25, random_state=7)
# Naive Bayes classifier
model = nb.GaussianNB()
# Cross-validation on the training set: 5 folds per metric, printing the
# mean of the weighted precision, recall and F1 scores.
pw = ms.cross_val_score(model, train_x, train_y, cv=5,
                        scoring='precision_weighted')
print(pw.mean())
rw = ms.cross_val_score(model, train_x, train_y, cv=5,
                        scoring='recall_weighted')
print(rw.mean())
fw = ms.cross_val_score(model, train_x, train_y, cv=5,
                        scoring='f1_weighted')
print(fw.mean())
model.fit(train_x, train_y)
# Sample the plot area (full data range padded by one unit) on a 0.005 grid
# and let the trained model label every grid point.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)
# Fraction of test samples whose predicted label equals the true label.
pred_test_y = model.predict(test_x)
print((pred_test_y == test_y).sum() / pred_test_y.size)
mp.figure('NB Classification', facecolor='lightgray')
mp.title('NB Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
# Background: predicted class per grid point; dots: test samples coloured by
# their true label.
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(test_x[:, 0], test_x[:, 1], c=test_y, cmap='brg', s=80)
mp.show()

4. 混淆矩阵

import sklearn.metrics as sm
sm.confusion_matrix(实际输出, 预测输出)->混淆矩阵
每一行对应一个实际类别
每一列对应一个预测类别
对角线上元素表示各个类别分类正确的样本数,其它位置的元素表示分类误差。
对角线上元素/其所在列元素之和 = 该类别的查准率
对角线上元素/其所在行元素之和 = 该类别的召回率
由该类别的查准率和召回率可进一步计算其F1得分。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
    import numpy as np
import sklearn.model_selection as ms
import sklearn.naive_bayes as nb
import sklearn.metrics as sm
import matplotlib.pyplot as mp
# Load the samples: every line of the file is a comma-separated record of
# numbers; the last field is the class label, the preceding fields are the
# features. ndmin=2 keeps the array two-dimensional even for a single record.
data = np.loadtxt('../data/multiple1.txt', delimiter=',', ndmin=2)
x, y = data[:, :-1], data[:, -1]
# Split into training and test sets at a 3:1 ratio
train_x, test_x, train_y, test_y = ms.train_test_split(
    x, y, test_size=0.25, random_state=7)
# Naive Bayes classifier
model = nb.GaussianNB()
# Cross-validation on the training set: 5 folds per metric, printing the
# mean of the weighted precision, recall and F1 scores.
pw = ms.cross_val_score(model, train_x, train_y, cv=5,
                        scoring='precision_weighted')
print(pw.mean())
rw = ms.cross_val_score(model, train_x, train_y, cv=5,
                        scoring='recall_weighted')
print(rw.mean())
fw = ms.cross_val_score(model, train_x, train_y, cv=5,
                        scoring='f1_weighted')
print(fw.mean())
model.fit(train_x, train_y)
# Sample the plot area (full data range padded by one unit) on a 0.005 grid
# and let the trained model label every grid point.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)
# Fraction of test samples whose predicted label equals the true label.
pred_test_y = model.predict(test_x)
print((pred_test_y == test_y).sum() / pred_test_y.size)
# Confusion matrix of true labels versus predicted labels on the test set
cm = sm.confusion_matrix(test_y, pred_test_y)
print(cm)
mp.figure('NB Classification', facecolor='lightgray')
mp.title('NB Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
# Background: predicted class per grid point; dots: test samples coloured by
# their true label.
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(test_x[:, 0], test_x[:, 1], c=test_y, cmap='brg', s=80)
mp.show()

5. 分类报告

针对每一个类别的评价指标及平均值。
sm.classification_report(实际输出, 预测输出)->分类报告

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import numpy as np
import sklearn.model_selection as ms
import sklearn.naive_bayes as nb
import sklearn.metrics as sm
import matplotlib.pyplot as mp
# Load the samples: every line of the file is a comma-separated record of
# numbers; the last field is the class label, the preceding fields are the
# features. ndmin=2 keeps the array two-dimensional even for a single record.
data = np.loadtxt('../data/multiple1.txt', delimiter=',', ndmin=2)
x, y = data[:, :-1], data[:, -1]
# Split into training and test sets at a 3:1 ratio
train_x, test_x, train_y, test_y = ms.train_test_split(
    x, y, test_size=0.25, random_state=7)
# Naive Bayes classifier
model = nb.GaussianNB()
# Cross-validation on the training set: 5 folds per metric, printing the
# mean of the weighted precision, recall and F1 scores.
pw = ms.cross_val_score(model, train_x, train_y, cv=5,
                        scoring='precision_weighted')
print(pw.mean())
rw = ms.cross_val_score(model, train_x, train_y, cv=5,
                        scoring='recall_weighted')
print(rw.mean())
fw = ms.cross_val_score(model, train_x, train_y, cv=5,
                        scoring='f1_weighted')
print(fw.mean())
model.fit(train_x, train_y)
# Sample the plot area (full data range padded by one unit) on a 0.005 grid
# and let the trained model label every grid point.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)
# Fraction of test samples whose predicted label equals the true label.
pred_test_y = model.predict(test_x)
print((pred_test_y == test_y).sum() / pred_test_y.size)
# Confusion matrix of true labels versus predicted labels on the test set
cm = sm.confusion_matrix(test_y, pred_test_y)
print(cm)
# Classification report for the test set
cr = sm.classification_report(test_y, pred_test_y)
print(cr)
mp.figure('NB Classification', facecolor='lightgray')
mp.title('NB Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
# Background: predicted class per grid point; dots: test samples coloured by
# their true label.
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
mp.scatter(test_x[:, 0], test_x[:, 1], c=test_y, cmap='brg', s=80)
mp.show()

十一、决策树分类

投票
基于随机森林分类器的汽车品质评估

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import numpy as np
import sklearn.preprocessing as sp
import sklearn.ensemble as se
import sklearn.model_selection as ms
# Read the comma-separated records. strip() removes the line terminator
# without cutting a real character off a final line that lacks a newline
# (which line[:-1] would do); empty lines are skipped.
data = []
with open('../data/car.txt') as f:
    for line in f:
        line = line.strip()
        if line:
            data.append(line.split(','))
# Transpose so that each row of `data` holds all values of one column of the
# file; the last row is the target column.
data = np.array(data).T
# Encode every column with its own LabelEncoder and keep the encoders, in
# column order, so new samples can be encoded and predictions decoded later.
encoders, train_x = [], []
for row in range(len(data)):
    encoder = sp.LabelEncoder()
    if row < len(data) - 1:
        train_x.append(encoder.fit_transform(data[row]))
    else:
        train_y = encoder.fit_transform(data[row])
    encoders.append(encoder)
train_x = np.array(train_x).T
# Random forest classifier
model = se.RandomForestClassifier(
    max_depth=8, n_estimators=200, random_state=7)
# Mean weighted F1 score over 3 cross-validation folds.
print(ms.cross_val_score(model, train_x, train_y, cv=3,
                         scoring='f1_weighted').mean())
model.fit(train_x, train_y)
# New samples to classify; each inner list holds the feature values of one
# sample in the same column order as the training file.
data = [
    ['high', 'med', '5more', '4', 'big', 'low'],
    ['high', 'high', '4', '4', 'med', 'med'],
    ['low', 'low', '2', '4', 'small', 'high'],
    ['low', 'med', '3', '4', 'med', 'high']]
data = np.array(data).T
# Encode each feature column with the encoder fitted on that column.
test_x = []
for row in range(len(data)):
    encoder = encoders[row]
    test_x.append(encoder.transform(data[row]))
test_x = np.array(test_x).T
pred_test_y = model.predict(test_x)
# The last encoder belongs to the target column; map codes back to labels.
print(encoders[-1].inverse_transform(pred_test_y))

1. 验证曲线

模型的交叉验证得分=f(模型超参数)
ms.validation_curve(
模型对象, 输入集, 输出集, param_name=超参数名, param_range=超参数表, cv=验证次数)
->训练集得分矩阵, 测试集得分矩阵
得分矩阵的每一行对应一个超参数取值,每一列对应一次验证;对每一行取平均值,即得到该超参数取值下的平均得分。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import numpy as np
import sklearn.preprocessing as sp
import sklearn.ensemble as se
import sklearn.model_selection as ms
import matplotlib.pyplot as mp
# Read the comma-separated records. strip() removes the line terminator
# without cutting a real character off a final line that lacks a newline
# (which line[:-1] would do); empty lines are skipped.
data = []
with open('../data/car.txt') as f:
    for line in f:
        line = line.strip()
        if line:
            data.append(line.split(','))
# Transpose so that each row of `data` holds all values of one column of the
# file; the last row is the target column.
data = np.array(data).T
# Label-encode every column; all but the last become features.
x = []
for row in range(len(data)):
    encoder = sp.LabelEncoder()
    if row < len(data) - 1:
        x.append(encoder.fit_transform(data[row]))
    else:
        y = encoder.fit_transform(data[row])
x = np.array(x).T
# Random forest classifier
# Validation curve 1: score as a function of n_estimators at max_depth=8.
# param_name / param_range are passed by keyword because current
# scikit-learn releases declare them keyword-only, and scoring is set
# explicitly so the computed metric matches the 'F1 Score' axis label.
model = se.RandomForestClassifier(max_depth=8, random_state=7)
n_estimators = np.arange(50, 550, 50)
train_scores1, test_scores1 = ms.validation_curve(
    model, x, y, param_name='n_estimators', param_range=n_estimators,
    cv=5, scoring='f1_weighted')
# One row per hyper-parameter value, one column per fold: average the folds.
train_means1 = train_scores1.mean(axis=1)
# Validation curve 2: score as a function of max_depth at n_estimators=150.
model = se.RandomForestClassifier(n_estimators=150, random_state=7)
max_depth = np.arange(1, 11)
train_scores2, test_scores2 = ms.validation_curve(
    model, x, y, param_name='max_depth', param_range=max_depth,
    cv=5, scoring='f1_weighted')
train_means2 = train_scores2.mean(axis=1)
mp.figure('n_estimators', facecolor='lightgray')
mp.title('n_estimators', fontsize=20)
mp.xlabel('n_estimators', fontsize=14)
mp.ylabel('F1 Score', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.plot(n_estimators, train_means1, 'o-', c='dodgerblue', label='Training')
mp.legend()
mp.tight_layout()
mp.figure('max_depth', facecolor='lightgray')
mp.title('max_depth', fontsize=20)
mp.xlabel('max_depth', fontsize=14)
mp.ylabel('F1 Score', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.plot(max_depth, train_means2, 'o-', c='limegreen', label='Training')
mp.legend()
mp.tight_layout()
mp.show()

2. 学习曲线

模型的交叉验证得分=f(训练集大小)
ms.learning_curve(
模型对象, 输入集, 输出集, train_sizes=训练集大小列表, cv=验证次数)
->训练集大小列表, 训练集得分矩阵, 测试集得分矩阵

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import numpy as np
import sklearn.preprocessing as sp
import sklearn.ensemble as se
import sklearn.model_selection as ms
import matplotlib.pyplot as mp
# Read the comma-separated records. strip() removes the line terminator
# without cutting a real character off a final line that lacks a newline
# (which line[:-1] would do); empty lines are skipped.
data = []
with open('../data/car.txt') as f:
    for line in f:
        line = line.strip()
        if line:
            data.append(line.split(','))
# Transpose so that each row of `data` holds all values of one column of the
# file; the last row is the target column.
data = np.array(data).T
# Label-encode every column; all but the last become features.
x = []
for row in range(len(data)):
    encoder = sp.LabelEncoder()
    if row < len(data) - 1:
        x.append(encoder.fit_transform(data[row]))
    else:
        y = encoder.fit_transform(data[row])
x = np.array(x).T
# Random forest classifier
model = se.RandomForestClassifier(
    max_depth=8, n_estimators=150, random_state=7)
# Training-set sizes as fractions from 0.1 to 1.0 in ten steps.
train_sizes = np.linspace(0.1, 1, 10)
print(train_sizes)
# scoring is set explicitly so the computed metric matches the 'F1 Score'
# axis label used below.
train_sizes, train_scores, test_scores = ms.learning_curve(
    model, x, y, train_sizes=train_sizes, cv=5, scoring='f1_weighted')
# One row per training-set size, one column per fold: average the folds.
train_means = train_scores.mean(axis=1)
test_means = test_scores.mean(axis=1)
mp.figure('Learning Curve', facecolor='lightgray')
mp.title('Learning Curve', fontsize=20)
mp.xlabel('train_size', fontsize=14)
mp.ylabel('F1 Score', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.plot(train_sizes, train_means, 'o-', c='dodgerblue', label='Training')
mp.plot(train_sizes, test_means, 'o-', c='orangered', label='Testing')
mp.legend()
mp.tight_layout()
mp.show()

十二、支持向量机

1. 原理

寻求最优的分类边界,即被支持向量所夹持的分类带宽度达到最大值,取其中心线作为分类边界。
* 安全性:分类带最宽
* 公平性:分类带中心线
* 简单性:线性边界,分割超平面
* 对于在原始维度空间中无法线性分割的样本,通过特定的核函数增加特征,即升高维度,在高维度空间寻求分割超平面。

2. 接口

import sklearn.svm as svm
分类器模型=svm.SVC(kernel=核函数类型, …)
回归器模型=svm.SVR(kernel=核函数类型, …)

3. 线性核函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import numpy as np
import sklearn.model_selection as ms
import sklearn.svm as svm
import sklearn.metrics as sm
import matplotlib.pyplot as mp
# Load the samples: every line of the file is a comma-separated record of
# numbers; the last field is the class label, the preceding fields are the
# features. ndmin=2 keeps the array two-dimensional even for a single record.
data = np.loadtxt('../data/multiple2.txt', delimiter=',', ndmin=2)
x, y = data[:, :-1], data[:, -1]
train_x, test_x, train_y, test_y = ms.train_test_split(
    x, y, test_size=0.25, random_state=7)
# Support vector machine classifier with a linear kernel
model = svm.SVC(kernel='linear')
model.fit(train_x, train_y)
# Sample the plot area (full data range padded by one unit) on a 0.005 grid
# and let the trained model label every grid point.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)
pred_test_y = model.predict(test_x)
print(sm.classification_report(test_y, pred_test_y))
mp.figure('SVM Classification', facecolor='lightgray')
mp.title('SVM Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
# Boolean masks selecting the samples of class 0 and class 1.
C0, C1 = y == 0, y == 1
mp.scatter(x[C0][:, 0], x[C0][:, 1], c='orangered', s=80)
mp.scatter(x[C1][:, 0], x[C1][:, 1], c='limegreen', s=80)
mp.show()

4. 多项式核函数

x1 x2
x1 x2 x1^2 x2^2 x1x2 x1^2x2 x1x2^2 x1^3 x2^3

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import numpy as np
import sklearn.model_selection as ms
import sklearn.svm as svm
import sklearn.metrics as sm
import matplotlib.pyplot as mp
# Load the samples: every line of the file is a comma-separated record of
# numbers; the last field is the class label, the preceding fields are the
# features. ndmin=2 keeps the array two-dimensional even for a single record.
data = np.loadtxt('../data/multiple2.txt', delimiter=',', ndmin=2)
x, y = data[:, :-1], data[:, -1]
train_x, test_x, train_y, test_y = ms.train_test_split(
    x, y, test_size=0.25, random_state=7)
# Support vector machine classifier with a degree-3 polynomial kernel
model = svm.SVC(kernel='poly', degree=3)
model.fit(train_x, train_y)
# Sample the plot area (full data range padded by one unit) on a 0.005 grid
# and let the trained model label every grid point.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)
pred_test_y = model.predict(test_x)
print(sm.classification_report(test_y, pred_test_y))
mp.figure('SVM Classification', facecolor='lightgray')
mp.title('SVM Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
# Boolean masks selecting the samples of class 0 and class 1.
C0, C1 = y == 0, y == 1
mp.scatter(x[C0][:, 0], x[C0][:, 1], c='orangered', s=80)
mp.scatter(x[C1][:, 0], x[C1][:, 1], c='limegreen', s=80)
mp.show()

5. 径向基核函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import numpy as np
import sklearn.model_selection as ms
import sklearn.svm as svm
import sklearn.metrics as sm
import matplotlib.pyplot as mp
# Load the samples: every line of the file is a comma-separated record of
# numbers; the last field is the class label, the preceding fields are the
# features. ndmin=2 keeps the array two-dimensional even for a single record.
data = np.loadtxt('../data/multiple2.txt', delimiter=',', ndmin=2)
x, y = data[:, :-1], data[:, -1]
train_x, test_x, train_y, test_y = ms.train_test_split(
    x, y, test_size=0.25, random_state=7)
# Support vector machine classifier with a radial basis function kernel
model = svm.SVC(kernel='rbf', gamma=0.01, C=600)
model.fit(train_x, train_y)
# Sample the plot area (full data range padded by one unit) on a 0.005 grid
# and let the trained model label every grid point.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)
pred_test_y = model.predict(test_x)
print(sm.classification_report(test_y, pred_test_y))
mp.figure('SVM Classification', facecolor='lightgray')
mp.title('SVM Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
# Boolean masks selecting the samples of class 0 and class 1.
C0, C1 = y == 0, y == 1
mp.scatter(x[C0][:, 0], x[C0][:, 1], c='orangered', s=80)
mp.scatter(x[C1][:, 0], x[C1][:, 1], c='limegreen', s=80)
mp.show()

6. 样本均衡

svm.SVC(…, class_weight='balanced')
通过为不同类别的样本设置不同的权重,平衡比例相差较大的类别为分类器所带来的不同影响。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import numpy as np
import sklearn.model_selection as ms
import sklearn.svm as svm
import sklearn.metrics as sm
import matplotlib.pyplot as mp
# Load the samples: every line of the file is a comma-separated record of
# numbers; the last field is the class label, the preceding fields are the
# features. ndmin=2 keeps the array two-dimensional even for a single record.
data = np.loadtxt('../data/imbalance.txt', delimiter=',', ndmin=2)
x, y = data[:, :-1], data[:, -1]
train_x, test_x, train_y, test_y = ms.train_test_split(
    x, y, test_size=0.25, random_state=7)
# Support vector machine classifier with a linear kernel and balanced
# class weights
model = svm.SVC(kernel='linear', class_weight='balanced')
model.fit(train_x, train_y)
# Sample the plot area (full data range padded by one unit) on a 0.005 grid
# and let the trained model label every grid point.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)
pred_test_y = model.predict(test_x)
print(sm.classification_report(test_y, pred_test_y))
mp.figure('SVM Classification', facecolor='lightgray')
mp.title('SVM Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
# Boolean masks selecting the samples of class 0 and class 1.
C0, C1 = y == 0, y == 1
mp.scatter(x[C0][:, 0], x[C0][:, 1], c='orangered', s=80)
mp.scatter(x[C1][:, 0], x[C1][:, 1], c='limegreen', s=80)
mp.show()

7. 置信概率

根据样本与分类边界的距离表示该样本被分类器归属每个类别的概率。
model = svm.SVC(…, probability=True, …)
model.predict_proba(输入集)->概率矩阵
概率矩阵中的一行对应输入集中的一个样本,一列对应一个类别,而其中的值表示该样本被归属为特定类别的概率。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import numpy as np
import sklearn.model_selection as ms
import sklearn.svm as svm
import sklearn.metrics as sm
import matplotlib.pyplot as mp
# Load the samples: every line of the file is a comma-separated record of
# numbers; the last field is the class label, the preceding fields are the
# features. ndmin=2 keeps the array two-dimensional even for a single record.
data = np.loadtxt('../data/multiple2.txt', delimiter=',', ndmin=2)
x, y = data[:, :-1], data[:, -1]
train_x, test_x, train_y, test_y = ms.train_test_split(
    x, y, test_size=0.25, random_state=7)
# Support vector machine classifier; probability=True is required for the
# predict_proba call below.
model = svm.SVC(kernel='rbf', gamma=0.01, C=600, probability=True)
model.fit(train_x, train_y)
# Sample the plot area (full data range padded by one unit) on a 0.005 grid
# and let the trained model label every grid point.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)
pred_test_y = model.predict(test_x)
print(sm.classification_report(test_y, pred_test_y))
# Extra points for which both the predicted class and the per-class
# probabilities are printed.
prob_x = np.array([
    [2, 1.5],
    [8, 9],
    [4.8, 5.2],
    [4, 4],
    [2.5, 7],
    [7.6, 2],
    [5.4, 5.9]])
print(prob_x)
pred_prob_y = model.predict(prob_x)
print(pred_prob_y)
probs = model.predict_proba(prob_x)
print(probs)
mp.figure('SVM Classification', facecolor='lightgray')
mp.title('SVM Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
# Boolean masks selecting the samples of class 0 and class 1.
C0, C1 = y == 0, y == 1
mp.scatter(x[C0][:, 0], x[C0][:, 1], c='orangered', s=80)
mp.scatter(x[C1][:, 0], x[C1][:, 1], c='limegreen', s=80)
mp.show()

8. 网格搜索

超参数组合列表:
[{超参数名: [取值列表], 超参数名: [取值列表]}, {…}, …]
model = ms.GridSearchCV(
基本模型, 超参数组合列表, cv=交叉验证数)
model.fit(输入集, 输出集)
model -> 用最优超参数组合设置的模型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import numpy as np
import sklearn.model_selection as ms
import sklearn.svm as svm
import sklearn.metrics as sm
import matplotlib.pyplot as mp
# Load the samples: every line of the file is a comma-separated record of
# numbers; the last field is the class label, the preceding fields are the
# features. ndmin=2 keeps the array two-dimensional even for a single record.
data = np.loadtxt('../data/multiple2.txt', delimiter=',', ndmin=2)
x, y = data[:, :-1], data[:, -1]
train_x, test_x, train_y, test_y = ms.train_test_split(
    x, y, test_size=0.25, random_state=7)
# Hyper-parameter grids: one dict per kernel, each listing the candidate
# values to combine.
params = [
    {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
    {'kernel': ['poly'], 'C': [1], 'degree': [2, 3]},
    {'kernel': ['rbf'], 'C': [1, 10, 100, 1000],
     'gamma': [1, 0.1, 0.01, 0.001]}]
model = ms.GridSearchCV(svm.SVC(probability=True), params, cv=5)
model.fit(train_x, train_y)
# Print every tried combination with its mean cross-validation score, then
# the best combination.
for param, score in zip(model.cv_results_['params'],
                        model.cv_results_['mean_test_score']):
    print(param, score)
print('-' * 20)
print(model.best_params_)
# Sample the plot area (full data range padded by one unit) on a 0.005 grid
# and let the fitted search object label every grid point.
l, r, h = x[:, 0].min() - 1, x[:, 0].max() + 1, 0.005
b, t, v = x[:, 1].min() - 1, x[:, 1].max() + 1, 0.005
grid_x = np.meshgrid(np.arange(l, r, h), np.arange(b, t, v))
flat_x = np.c_[grid_x[0].ravel(), grid_x[1].ravel()]
flat_y = model.predict(flat_x)
grid_y = flat_y.reshape(grid_x[0].shape)
pred_test_y = model.predict(test_x)
print(sm.classification_report(test_y, pred_test_y))
# Extra points for which both the predicted class and the per-class
# probabilities are printed and annotated on the plot.
prob_x = np.array([
    [2, 1.5],
    [8, 9],
    [4.8, 5.2],
    [4, 4],
    [2.5, 7],
    [7.6, 2],
    [5.4, 5.9]])
print(prob_x)
pred_prob_y = model.predict(prob_x)
print(pred_prob_y)
probs = model.predict_proba(prob_x)
print(probs)
mp.figure('SVM Classification', facecolor='lightgray')
mp.title('SVM Classification', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.pcolormesh(grid_x[0], grid_x[1], grid_y, cmap='gray')
# Boolean masks selecting the data samples of class 0 and class 1.
C0, C1 = y == 0, y == 1
mp.scatter(x[C0][:, 0], x[C0][:, 1], c='orangered', s=80)
mp.scatter(x[C1][:, 0], x[C1][:, 1], c='limegreen', s=80)
# From here on the masks select the extra points by their predicted class.
C0, C1 = pred_prob_y == 0, pred_prob_y == 1
mp.scatter(prob_x[C0][:, 0], prob_x[C0][:, 1],
           marker='D', c='dodgerblue', s=70)
mp.scatter(prob_x[C1][:, 0], prob_x[C1][:, 1],
           marker='D', c='deeppink', s=70)
# Label every extra point with its two class probabilities in percent; the
# box colour depends on the predicted class (class 0 first, then class 1).
for mask, face_color in ((C0, 'deepskyblue'), (C1, 'violet')):
    for point, prob in zip(prob_x[mask], probs[mask]):
        mp.annotate(
            '{}% {}%'.format(round(prob[0] * 100, 2),
                             round(prob[1] * 100, 2)),
            xy=(point[0], point[1]),
            xytext=(12, -12),
            textcoords='offset points',
            horizontalalignment='left',
            verticalalignment='top',
            fontsize=9,
            bbox={'boxstyle': 'round,pad=0.6',
                  'fc': face_color,
                  'alpha': 0.8})
mp.show()

事件预测

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import numpy as np
import sklearn.preprocessing as sp
import sklearn.model_selection as ms
import sklearn.svm as svm
class DigitEncoder():
    """Encoder for columns of digit strings, mirroring the LabelEncoder API.

    It exposes the same three methods the surrounding script calls on
    ``LabelEncoder`` (``fit_transform``, ``transform``,
    ``inverse_transform``), but converts by casting the array instead of
    building a label table.
    """

    def fit_transform(self, x):
        """Return *x* cast to int; there is nothing to fit."""
        return self.transform(x)

    def transform(self, x):
        """Return the array *x* cast to int via ``x.astype(int)``."""
        return x.astype(int)

    def inverse_transform(self, x):
        """Return the array *x* cast back to str via ``x.astype(str)``."""
        return x.astype(str)
# Read the comma-separated records. strip() removes the line terminator
# without cutting a real character off a final line that lacks a newline
# (which line[:-1] would do); empty lines are skipped.
data = []
with open("../data/event.txt", 'r') as f:
    for line in f:
        line = line.strip()
        if line:
            data.append(line.split(','))
# Transpose so that each row holds all values of one column of the file,
# then drop the row at index 1 (the file's second column).
data = np.delete(np.array(data).T, 1, 0)
# Pick an encoder per column: DigitEncoder when the column's first value
# consists of digits, LabelEncoder otherwise. Encoders are kept in column
# order for encoding new samples and decoding predictions.
encoders, x = [], []
for row in range(len(data)):
    if data[row, 0].isdigit():
        encoder = DigitEncoder()
    else:
        encoder = sp.LabelEncoder()
    if row < len(data) - 1:
        x.append(encoder.fit_transform(data[row]))
    else:
        y = encoder.fit_transform(data[row])
    encoders.append(encoder)
x = np.array(x).T
train_x, test_x, train_y, test_y = ms.train_test_split(
    x, y, test_size=0.25, random_state=5)
model = svm.SVC(kernel='rbf', class_weight='balanced')
# Mean accuracy over 5 cross-validation folds on the training set.
print(ms.cross_val_score(model, train_x, train_y, cv=5,
                         scoring='accuracy').mean())
model.fit(train_x, train_y)
# Fraction of test samples whose predicted label equals the true label.
pred_test_y = model.predict(test_x)
print((pred_test_y == test_y).sum() / pred_test_y.size)
# One new sample to classify, given as the remaining feature columns in
# file order.
data = [['Tuesday', '12:30:00', '21', '23']]
data = np.array(data).T
# Encode each feature column with the encoder fitted on that column.
x = []
for row in range(len(data)):
    encoder = encoders[row]
    x.append(encoder.transform(data[row]))
x = np.array(x).T
pred_y = model.predict(x)
# The last encoder belongs to the target column; map codes back to labels.
print(encoders[-1].inverse_transform(pred_y))