Deep Learning Code Index

This post compiles an index of code snippets that are likely to be reused, for quick lookup.

Machine Learning

Classification Performance Metrics

Precision and Recall

# Confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_log_predict)

# Precision
from sklearn.metrics import precision_score
precision_score(y_test, y_log_predict)

# Recall
from sklearn.metrics import recall_score
recall_score(y_test, y_log_predict)
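
The snippet above assumes y_test and y_log_predict already exist. A minimal self-contained sketch that produces them; the breast-cancer dataset and logistic-regression model are illustrative choices, not part of the original snippet:

# Minimal sketch: train a binary classifier so that y_test / y_log_predict exist.
# Dataset and model choices here are illustrative, not from the original post.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score

X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

log_reg = LogisticRegression(max_iter=5000)
log_reg.fit(X_train, y_train)
y_log_predict = log_reg.predict(X_test)

print(confusion_matrix(y_test, y_log_predict))
print(precision_score(y_test, y_log_predict))
print(recall_score(y_test, y_log_predict))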

Plotting the P-R Curve

# Plot a P-R curve using the iris dataset
print(__doc__)  # print the module docstring

import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm, datasets
from sklearn.metrics import precision_recall_curve  # precision and recall
from sklearn.metrics import average_precision_score
from sklearn.preprocessing import label_binarize  # binarize the labels
from sklearn.multiclass import OneVsRestClassifier  # one-vs-rest (each class in turn is the positive class, the rest are negative)

# from sklearn.cross_validation import train_test_split  # for Anaconda 3.6 and earlier
from sklearn.model_selection import train_test_split  # for Anaconda 3.7; train/test split helper

# Use the iris data as an example and plot the P-R curve
iris = datasets.load_iris()
X = iris.data    # 150*4
y = iris.target  # 150*1

# Binarize the labels, turning the three classes into 001, 010, 100.
# This is a multi-class problem, so the OneVsRestClassifier strategy below
# converts it into several binary problems.
y = label_binarize(y, classes=[0, 1, 2])  # converts 150*1 into 150*3
n_classes = y.shape[1]  # number of columns, equal to 3
print(y)

# Add 800 noise features
random_state = np.random.RandomState(0)
n_samples, n_features = X.shape

X = np.c_[X, random_state.randn(n_samples, 200 * n_features)]  # rows unchanged, columns added: 150*804

# Split into training and test sets, 50/50
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=random_state)  # fix the random state; otherwise each run gives a different split

# One-vs-rest: build a new classifier that reduces the problem to binary classification
classifier = OneVsRestClassifier(svm.SVC(kernel='linear', probability=True, random_state=random_state))
# Fit on the training set, then feed in the test samples to get a decision score for each test sample
y_score = classifier.fit(X_train, y_train).decision_function(X_test)

# Compute Precision-Recall and plot curve
# The underscore below receives the returned thresholds: "_" serves as a throwaway
# name for a value that will not be used again.
precision = dict()
recall = dict()
average_precision = dict()
for i in range(n_classes):
    # For each class, compute the precision and recall sequences (":" means all rows, i is the i-th column)
    precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], y_score[:, i])
    average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i])  # slice: performance on the i-th class

# Compute micro-average curve and area. ravel() flattens a multi-dimensional array to 1-D
precision["micro"], recall["micro"], _ = precision_recall_curve(y_test.ravel(), y_score.ravel())
average_precision["micro"] = average_precision_score(y_test, y_score, average="micro")  # this score corresponds to the area under the precision-recall curve

# Plot Precision-Recall curve for each class
plt.clf()  # clf clears the current figure window
plt.plot(recall["micro"], precision["micro"],
         label='micro-average Precision-recall curve (area = {0:0.2f})'.format(average_precision["micro"]))
for i in range(n_classes):
    plt.plot(recall[i], precision[i],
             label='Precision-recall curve of class {0} (area = {1:0.2f})'.format(i, average_precision[i]))

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])  # xlim / ylim set the display range of the X and Y axes
plt.xlabel('Recall', fontsize=16)
plt.ylabel('Precision', fontsize=16)
plt.title('Extension of Precision-Recall curve to multi-class', fontsize=16)
plt.legend(loc="lower right")  # legend configures the plot legend
plt.show()

Feature Processing

Feature Scaling (Standardization and Min-Max Scaling)

# -*- coding: utf-8 -*-
"""
Demo: feature scaling
(two methods: standardization and min-max scaling; each method has two examples,
a simple 2-D matrix and the iris dataset)
"""
# Method 1: standardization. Example 1: scale the columns of a simple 2-D matrix
from sklearn import preprocessing
import numpy as np
# Use a numpy array because we need its mean and similar methods, which a plain list lacks
X = np.array([[0, 0],
              [0, 0],
              [100, 1],
              [1, 1]])
# calculate mean
X_mean = X.mean(axis=0)
# calculate standard deviation
X_std = X.std(axis=0)
# print(X_std)
# standardize X
X1 = (X - X_mean) / X_std
print(X1)
print("")

# we can also use the function preprocessing.scale to standardize X
X_scale = preprocessing.scale(X)
print(X_scale)


# Method 1: standardization. Example 2: scale the columns of the iris data matrix,
# this time with the bundled StandardScaler class
from sklearn import datasets
iris = datasets.load_iris()
X_scale = preprocessing.StandardScaler().fit_transform(iris.data)
print(X_scale)

# Method 2: min-max scaling. Example 3: scale the columns of a simple 2-D matrix
from sklearn.preprocessing import MinMaxScaler

data = [[0, 0],
        [0, 0],
        [100, 1],
        [1, 1]]

scaler = MinMaxScaler()
print(scaler.fit(data))
print(scaler.transform(data))

# Method 2: min-max scaling. Example 4: scale the columns of the iris data matrix
from sklearn.preprocessing import MinMaxScaler

data = iris.data

scaler = MinMaxScaler()
print(scaler.fit(data))
print(scaler.transform(data))
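
A hedged usage note, not from the original examples: when the data is split into training and test sets, the scaler should be fit on the training portion only and then reused on the test portion. A minimal sketch:

# Minimal sketch (not from the original post): fit the scaler on training data only,
# then apply the same fitted transform to the test data.
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

iris = datasets.load_iris()
X_train, X_test = train_test_split(iris.data, test_size=0.3, random_state=0)

scaler = MinMaxScaler().fit(X_train)       # learn column-wise min/max from the training set only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)   # reuse the same fitted scaling on the test set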

Feature Vectorization

# Feature vectorization
import math
import numpy as np

# Method 1: boolean / count weights
# Input: a word list and a list of tokenized sentences
# Output: the feature matrix
def boolCountVectorizer(wordlst, senlst):
    wordCount = len(wordlst)  # total number of words; the matrix has wordCount columns
    senCount = len(senlst)    # total number of sentences; the matrix has senCount rows
    mat = np.zeros((senCount, wordCount), dtype=int)
    word_index = {}
    for i in range(len(wordlst)):  # record the index of each word in wordlst up front
        word_index[str(wordlst[i])] = i
    for i in range(senCount):
        for j in range(len(senlst[i])):
            if senlst[i][j] in wordlst:
                idx = word_index[str(senlst[i][j])]
                mat[i][idx] += 1  # accumulates counts; for a strictly boolean weight use mat[i][idx] = 1

    return mat

# mat = boolCountVectorizer(wordlst, senlst)
# print('mat.shape=', mat.shape)

# Method 2: tf-idf weights
def tfidfCountVectorizer(wordlst, senlst):

    wordCount = len(wordlst)
    senCount = len(senlst)
    weight = np.zeros((senCount, wordCount), dtype=float)

    n3 = senCount  # n3 is the total number of sentences
    for j, word in enumerate(wordlst):
        n4 = 0  # n4 is the number of sentences containing this word
        for eve in senlst:
            if word in eve:
                n4 += 1
        idf = math.log(((n3 + 1) / (n4 + 1)) + 1)

        for i in range(senCount):
            n1 = senlst[i].count(word)
            n2 = len(senlst[i])
            tf = n1 / n2  # tf is the frequency of word in sentence senlst[i]

            weight[i][j] = tf * idf  # weight[i][j] is the tf-idf weight of the j-th word in the i-th sentence
    return weight

# mat = tfidfCountVectorizer(wordlst, senlst)
# print(mat)
# print(mat.shape)
# print(np.sum(mat != 0.0))
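
A minimal usage sketch for the two vectorizers above; the vocabulary and tokenized sentences are toy data made up for illustration:

# Toy usage example (data is illustrative, not from the original post)
wordlst = ['deep', 'learning', 'code', 'index']
senlst = [['deep', 'learning', 'code'],
          ['code', 'index', 'code'],
          ['learning', 'index']]

mat_bool = boolCountVectorizer(wordlst, senlst)
mat_tfidf = tfidfCountVectorizer(wordlst, senlst)
print(mat_bool)         # 3 x 4 count matrix
print(mat_tfidf.shape)  # (3, 4)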

Feature Selection

# Feature selection
from scipy.stats import pearsonr
import numpy as np

# Method 1: variance threshold
# Input: feature matrix mat, parameter p, word list wordlst
# Output: the feature matrix and the feature words kept after selection
def featureSupport1(mat, p, wordlst):
    wordlst = np.array(wordlst)

    var = p * (1 - p)
    lst = np.var(mat, axis=0)  # variance of each column
    mu = lst >= var            # mu is a boolean mask
    return mat[:, mu], wordlst[mu]

# Method 2: Pearson correlation coefficient
def featureSupport2(mat, threshold, wordlst, y_train):
    wordlst = np.array(wordlst)
    y_train = np.array(y_train)

    lst = [pearsonr(mat[:, i], y_train.T)[0] for i in range(mat.shape[1])]
    mu = np.abs(np.array(lst)) >= threshold
    return mat[:, mu], wordlst[mu]
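
A minimal usage sketch for the two selectors; the toy matrix, word list, labels, and thresholds below are illustrative only:

# Toy usage example (data is illustrative, not from the original post)
import numpy as np

toy_mat = np.array([[1, 0, 1, 0],
                    [0, 0, 1, 1],
                    [1, 1, 0, 1]])
toy_words = ['deep', 'learning', 'code', 'index']
toy_y = [1, 0, 1]

kept_mat, kept_words = featureSupport1(toy_mat, 0.8, toy_words)           # keep columns with variance >= 0.8 * (1 - 0.8)
print(kept_words)

kept_mat2, kept_words2 = featureSupport2(toy_mat, 0.3, toy_words, toy_y)  # keep columns with |Pearson r| >= 0.3
print(kept_words2)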

Deep Learning

Natural Language Processing

Tokenization

import string
import nltk

# Strip punctuation and tokenize
def get_tokens(text):
    lowers = text.lower()
    remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)
    no_punctuation = lowers.translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(no_punctuation)
    return tokens

# Reduce each token to its stem
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed
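
A short usage sketch tying the two helpers together; the sample sentence and the choice of PorterStemmer are illustrative, and nltk's 'punkt' tokenizer data needs to be downloaded once beforehand:

# Usage sketch (sample text and stemmer choice are illustrative, not from the original post)
import nltk
from nltk.stem.porter import PorterStemmer

# nltk.download('punkt')  # needed once for nltk.word_tokenize

text = "Deep learning models are learning useful representations."
tokens = get_tokens(text)                     # lowercase, strip punctuation, tokenize
stems = stem_tokens(tokens, PorterStemmer())  # e.g. 'learning' -> 'learn'
print(stems)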