1. Anomaly detection
Overview: use a Gaussian model to detect anomalous (low-probability) examples in a dataset, experimenting first on 2-D data. Each example has two features: a. throughput of server responses (mb/s); b. latency (ms).
Visualizing the data
main.py
from scipy.io import loadmat
import matplotlib.pyplot as plt
X = loadmat('ex8data1.mat')['X']
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X[:, 0], X[:, 1])
ax.set_xlabel('Latency(ms)')
ax.set_ylabel('Throughput(mb/s)')
plt.show()
1.1 Gaussian distribution
Overview: fit a Gaussian distribution to each feature independently.
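Each feature is modeled with the univariate Gaussian density

p(x; \mu, \sigma^2) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)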
1.2 Estimating parameters for a Gaussian
Overview: estimate the parameters of the Gaussian for each feature. Given a matrix X, return a vector mean with the n feature means and a vector sigma2 with the n feature variances.
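For each feature j over m examples, the maximum-likelihood estimates computed below are

\mu_j = \frac{1}{m}\sum_{i=1}^{m} x_j^{(i)}, \qquad \sigma_j^2 = \frac{1}{m}\sum_{i=1}^{m}\left(x_j^{(i)} - \mu_j\right)^2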
estimateGaussian.py
def estimateGaussian(X):
    # var defaults to ddof=0, i.e. the 1/m maximum-likelihood estimate above
    mean = X.mean(axis=0)
    sigma2 = X.var(axis=0)
    return mean, sigma2
main.py
from scipy.io import loadmat
from estimateGaussian import *  # estimate the Gaussian parameters
X = loadmat('ex8data1.mat')['X']
mean, sigma2 = estimateGaussian(X)
print(mean, sigma2)
[14.11222578 14.99771051] [1.83263141 1.70974533]
Visualizing the data:
np.meshgrid generates grid-point coordinates, e.g.:
xplot = np.linspace(0, 25, 100)
yplot = np.linspace(0, 25, 100)
Xplot, Yplot = np.meshgrid(xplot, yplot)  # generate grid-point coordinates
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(Xplot, Yplot, s=5)
plt.show()
gaussianDistribution.py
import numpy as np
def gaussianDistribution(X, mean, sigma2):
    sigma2 = np.reshape(sigma2, (1, 2))
    mean = np.reshape(mean, (1, 2))
    # per-feature univariate Gaussian density; note the normalizer is
    # sqrt(2*pi*sigma2), i.e. sqrt(2*pi)*sigma, not sqrt(2*pi)*sigma2
    p = (1 / np.sqrt(2 * np.pi * sigma2)) * np.exp(-(X - mean) ** 2 / (2 * sigma2))
    # np.prod multiplies the per-feature densities into the joint density
    return np.prod(p, axis=1)
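As a quick sanity check (a throwaway script, not one of the exercise files), the joint density above should agree with the product of scipy.stats.norm densities when the scale is set to the standard deviation np.sqrt(sigma2):
check.py
from scipy.io import loadmat
from scipy import stats
import numpy as np
from estimateGaussian import *
from gaussianDistribution import *
X = loadmat('ex8data1.mat')['X']
mean, sigma2 = estimateGaussian(X)
p_ours = gaussianDistribution(X, mean, sigma2)
# stats.norm(loc, scale) expects the standard deviation as its scale
p_ref = (stats.norm(mean[0], np.sqrt(sigma2[0])).pdf(X[:, 0])
         * stats.norm(mean[1], np.sqrt(sigma2[1])).pdf(X[:, 1]))
print(np.allclose(p_ours, p_ref))  # expect True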
main.py
from scipy.io import loadmat
import numpy as np
import matplotlib.pyplot as plt
from estimateGaussian import *  # estimate the Gaussian parameters
from gaussianDistribution import *  # evaluate the Gaussian density
X = loadmat('ex8data1.mat')['X']
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X[:, 0], X[:, 1])
mean, sigma2 = estimateGaussian(X)
gaussianDistribution(X, mean, sigma2)  # smoke-test the density on the training data (result unused)
xplot = np.linspace(0, 25, 100)
yplot = np.linspace(0, 25, 100)
Xplot, Yplot = np.meshgrid(xplot, yplot)  # generate grid-point coordinates
# 1. concatenate joins arrays along the given axis
# 2. reshape(-1, 1) reshapes into a single column
X = np.concatenate((Xplot.reshape(-1, 1), Yplot.reshape(-1, 1)), axis=1)
p = gaussianDistribution(X, mean, sigma2).reshape(Xplot.shape)
# 3. draw the contour lines
# contour(X, Y, Z, [levels]) - levels sets the number and placement of the contours
contour = plt.contour(Xplot, Yplot, p, [10 ** -11, 10 ** -7, 10 ** -5, 10 ** -3, 0.1])
ax.set_xlabel('Latency(ms)')
ax.set_ylabel('Throughput(mb/s)')
plt.show()
1.3 Selecting the threshold, ε
Overview: choose the threshold ε on a cross-validation set using the F1-score; examples with low probability are likely anomalies.
This requires the precision and recall of the anomaly predictions.
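For a candidate ε, with tp, fp, and fn counted on the validation set, selectThreshold.py uses

\text{prec} = \frac{tp}{tp + fp}, \qquad \text{rec} = \frac{tp}{tp + fn}, \qquad F_1 = \frac{2 \cdot \text{prec} \cdot \text{rec}}{\text{prec} + \text{rec}}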
selectThreshold.py
import numpy as np
def selectThreshold(pval, yval):
    best_epsilon = 0
    best_f1 = 0
    f1 = 0
    step = (pval.max() - pval.min()) / 1000
    # 1. np.arange(start, stop, step) walks the candidate thresholds
    # 2. np.logical_and(A, B) returns the element-wise AND of A and B
    for epsilon in np.arange(pval.min(), pval.max(), step):
        predicts = pval < epsilon  # anomaly
        # predicts is (307, 2) while yval is (307, 1); NumPy broadcasting
        # compares yval against each column of predicts
        tp = np.sum(np.logical_and(predicts == 1, yval == 1)).astype(float)
        fp = np.sum(np.logical_and(predicts == 1, yval == 0)).astype(float)
        fn = np.sum(np.logical_and(predicts == 0, yval == 1)).astype(float)
        prec = tp / (tp + fp) if tp + fp else 0
        rec = tp / (tp + fn) if tp + fn else 0
        f1 = (2 * prec * rec) / (prec + rec) if prec + rec else 0
        if f1 > best_f1:
            best_f1 = f1
            best_epsilon = epsilon
    return best_epsilon, best_f1
main.py
from scipy.io import loadmat
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
from estimateGaussian import *  # estimate the Gaussian parameters
from gaussianDistribution import *  # evaluate the Gaussian density
from selectThreshold import *  # select the threshold
raw_data = loadmat('ex8data1.mat')
X, Xval, yval = raw_data['X'], raw_data['Xval'], raw_data['yval']
mean, sigma2 = estimateGaussian(X)
# print(Xval.shape, yval.shape) (307, 2) (307, 1)
# 1. stats.norm(loc, scale).pdf is the normal probability density; strictly,
#    scale should be the standard deviation np.sqrt(sigma2), but the variance
#    is passed here, and the printed threshold below reflects that choice
p = np.zeros(X.shape)
p[:, 0] = stats.norm(mean[0], sigma2[0]).pdf(X[:, 0])
p[:, 1] = stats.norm(mean[1], sigma2[1]).pdf(X[:, 1])
# print(p.shape) # (307, 2)
# 2. score the validation set under the same model parameters
pval = np.zeros(Xval.shape)
pval[:, 0] = stats.norm(mean[0], sigma2[0]).pdf(Xval[:, 0])
pval[:, 1] = stats.norm(mean[1], sigma2[1]).pdf(Xval[:, 1])
# print(pval.shape) # (307, 2)
epsilon, f1 = selectThreshold(pval, yval)
print(epsilon, f1)
0.009566706005956842 0.7142857142857143
Visualizing the result
main.py
from scipy.io import loadmat
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
from estimateGaussian import *  # estimate the Gaussian parameters
from gaussianDistribution import *  # evaluate the Gaussian density
from selectThreshold import *  # select the threshold
raw_data = loadmat('ex8data1.mat')
X, Xval, yval = raw_data['X'], raw_data['Xval'], raw_data['yval']
mean, sigma2 = estimateGaussian(X)
# print(Xval.shape, yval.shape) (307, 2) (307, 1)
# 1. stats.norm(loc, scale).pdf: probability density (same variance-as-scale caveat as above)
p = np.zeros(X.shape)
p[:, 0] = stats.norm(mean[0], sigma2[0]).pdf(X[:, 0])
p[:, 1] = stats.norm(mean[1], sigma2[1]).pdf(X[:, 1])
# print(p.shape) # (307, 2)
# 2. score the validation set under the same model parameters
pval = np.zeros(Xval.shape)
pval[:, 0] = stats.norm(mean[0], sigma2[0]).pdf(Xval[:, 0])
pval[:, 1] = stats.norm(mean[1], sigma2[1]).pdf(Xval[:, 1])
# print(pval.shape) # (307, 2)
epsilon, f1 = selectThreshold(pval, yval)
# np.where(A) returns the indices of the elements satisfying the condition;
# p has two columns, so a row index appears twice when both features fall below epsilon
outliers = np.where(p < epsilon)[0]
# print(outliers) # [300 301 301 303 303 304 306 306]
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(X[:, 0], X[:, 1])
ax.scatter(X[outliers, 0], X[outliers, 1], c='r', marker='o')
ax.set_xlabel('Latency(ms)')
ax.set_ylabel('Throughput(mb/s)')
plt.show()
2. Recommender Systems
Overview: build a recommender system by applying collaborative filtering to movie ratings.
2.1 Movie ratings dataset
Overview:
n_movies: number of movies; n_users: number of users
R(i,j): 1 if user j has rated movie i, 0 otherwise
Y(i,j): the rating (1-5) given to movie i by user j
Matrices:
X - movie data (row i holds the feature vector of movie i)
Theta - user data (row j holds the features describing user j's taste across movie types)
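The model predicts the rating of movie i by user j as the inner product of the two feature rows:

\hat{y}^{(i,j)} = (\theta^{(j)})^T x^{(i)}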
main.py
from scipy.io import loadmat
data = loadmat('ex8_movies.mat')
Y, R = data['Y'], data['R']
# print(Y.shape, R.shape) # (1682, 943) (1682, 943)
print(Y, R)
[[5 4 0 ... 5 0 0]
[3 0 0 ... 0 0 5]
[4 0 0 ... 0 0 0]
...
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]] [[1 1 0 ... 1 0 0]
[1 0 0 ... 0 0 1]
[1 0 0 ... 0 0 0]
...
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]
[0 0 0 ... 0 0 0]]
Computing the average rating of a movie (over the users who rated it)
main.py
from scipy.io import loadmat
import numpy as np
data = loadmat('ex8_movies.mat')
Y, R = data['Y'], data['R']
print(Y[1, np.where(R[1, :] == 1)].mean())
# 3.2061068702290076
"Visualizing" the data
main.py
from scipy.io import loadmat
import matplotlib.pyplot as plt
data = loadmat('ex8_movies.mat')
Y, R = data['Y'], data['R']
fig, ax = plt.subplots(figsize=(8, 6))
ax.imshow(Y)
ax.set_xlabel('Users')
ax.set_ylabel('Movies')
plt.show()
2.2 Collaborative filtering learning algorithm
Overview: learn the parameters X and Theta by minimizing the cost function over the given user ratings.
2.2.1 Collaborative filtering cost function
Overview:
To keep evaluation quick, only a small slice of the data is used (4 users, 5 movies, 3 features).
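costFunction below implements the unregularized collaborative-filtering cost, summed over rated entries only:

J = \frac{1}{2} \sum_{(i,j):\, r(i,j)=1} \left((\theta^{(j)})^T x^{(i)} - y^{(i,j)}\right)^2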
costFunction.py
import numpy as np
# 1. serialize: flatten X and theta into a single parameter vector
def serialize(X, theta):
    return np.concatenate((X.ravel(), theta.ravel()))
# 2. deserialize: recover X and theta from the flat vector
def deserialize(param, n_movies, n_users, n_features):
    return (param[:n_movies * n_features].reshape(n_movies, n_features),
            param[n_movies * n_features:].reshape(n_users, n_features))
def costFunction(param, Y, R, n_features):
    n_movies, n_users = Y.shape
    X, theta = deserialize(param, n_movies, n_users, n_features)
    return np.sum(np.power(np.multiply(X @ theta.T - Y, R), 2)) / 2
main.py
from scipy.io import loadmat
from costFunction import *  # cost function
data = loadmat('ex8_movies.mat')
Y, R = data['Y'], data['R']
param_data = loadmat('ex8_movieParams.mat')
X, Theta = param_data['X'], param_data['Theta']
# use only a small slice of the data here
n_users = 4
n_movies = 5
n_features = 3
X_sub = X[:n_movies, :n_features]
Theta_sub = Theta[:n_users, :n_features]
Y_sub = Y[:n_movies, :n_users]
R_sub = R[:n_movies, :n_users]
param_sub = serialize(X_sub, Theta_sub)
print(costFunction(param_sub, Y_sub, R_sub, n_features))
param = serialize(X, Theta)
print(costFunction(param, Y, R, 10))
22.224603725685675
27918.64012454421
2.2.2 Collaborative filtering gradient
Overview: compute the gradients.
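The vectorized code below implements, over rated entries only, the partial derivatives

\frac{\partial J}{\partial x_k^{(i)}} = \sum_{j:\, r(i,j)=1} \left((\theta^{(j)})^T x^{(i)} - y^{(i,j)}\right) \theta_k^{(j)}, \qquad \frac{\partial J}{\partial \theta_k^{(j)}} = \sum_{i:\, r(i,j)=1} \left((\theta^{(j)})^T x^{(i)} - y^{(i,j)}\right) x_k^{(i)}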
gradient.py
import numpy as np
from costFunction import *  # serialize/deserialize helpers
def gradient(param, Y, R, n_features):
    n_movies, n_users = Y.shape
    X, Theta = deserialize(param, n_movies, n_users, n_features)
    # zero out the error on unrated entries before forming the gradients
    inner = np.multiply(X @ Theta.T - Y, R)
    X_grad = inner @ Theta
    Theta_grad = inner.T @ X
    return serialize(X_grad, Theta_grad)
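A finite-difference check is a cheap way to validate these vectorized gradients. The sketch below (a throwaway script on small random data, not one of the exercise files) compares gradient against a numerical gradient of costFunction:
gradientCheck.py
import numpy as np
from costFunction import *  # serialize and the cost
from gradient import *  # analytic gradient
np.random.seed(0)
n_movies, n_users, n_features = 4, 3, 2
X = np.random.random((n_movies, n_features))
Theta = np.random.random((n_users, n_features))
Y = np.random.randint(1, 6, (n_movies, n_users)).astype(float)
R = (np.random.random((n_movies, n_users)) > 0.5).astype(float)
param = serialize(X, Theta)
analytic = gradient(param, Y, R, n_features)
# central differences: (J(p + e) - J(p - e)) / (2 * eps) per component
numeric = np.zeros_like(param)
eps = 1e-4
for i in range(param.size):
    e = np.zeros_like(param)
    e[i] = eps
    numeric[i] = (costFunction(param + e, Y, R, n_features)
                  - costFunction(param - e, Y, R, n_features)) / (2 * eps)
print(np.max(np.abs(analytic - numeric)))  # should be very small, around 1e-9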
main.py
from scipy.io import loadmat
from gradient import *  # compute the gradients
data = loadmat('ex8_movies.mat')
Y, R = data['Y'], data['R']
param_data = loadmat('ex8_movieParams.mat')
X, Theta = param_data['X'], param_data['Theta']
param = serialize(X, Theta)
n_movies, n_users = Y.shape
n_features = X.shape[1]
X_grad, Theta_grad = deserialize(gradient(param, Y, R, n_features), n_movies, n_users, n_features)
print(X_grad, Theta_grad)
[[-6.26184144 2.45936046 -6.87560329 ... -4.81611896 3.84341521
-1.88786696]
[-3.80931446 1.80494255 -2.63877955 ... -3.55580057 2.1709485
2.65129032]
[-3.13090116 2.54853961 0.23884578 ... -4.18778519 3.10538294
5.47323609]
...
[-1.04774171 0.99220776 -0.48920899 ... -0.75342146 0.32607323
-0.89053637]
[-0.7842118 0.76136861 -1.25614442 ... -1.05047808 1.63905435
-0.14891962]
[-0.38792295 1.06425941 -0.34347065 ... -2.04912884 1.37598855
0.19551671]] [[-1.54728877 9.0812347 -0.6421836 ... -3.92035321 5.66418748
1.16465605]
[-2.58829914 2.52342335 -1.52402705 ... -5.46793491 5.82479897
1.8849854 ]
[ 2.14588899 2.00889578 -4.32190712 ... -6.83365682 1.78952063
0.82886788]
...
[-4.59816821 3.63958389 -2.52909095 ... -3.50886008 2.99859566
0.64932177]
[-4.39652679 0.55036362 -1.98451805 ... -6.74723702 3.8538775
3.94901737]
[-3.75193726 1.44393885 -5.6925333 ... -6.56073746 5.20459188
2.65003952]]
2.2.3 Regularized cost function
Overview: regularize the cost function.
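With the regularization parameter λ (named learningRate in the code below), the regularized cost adds a penalty on every parameter:

J_{reg} = J + \frac{\lambda}{2} \sum_{j,k} \left(\theta_k^{(j)}\right)^2 + \frac{\lambda}{2} \sum_{i,k} \left(x_k^{(i)}\right)^2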
regularizedContent.py
import numpy as np
from costFunction import *  # the unregularized cost function
def regularizedCostFunction(param, Y, R, n_features, learningRate):
    # learningRate here acts as the regularization parameter lambda
    reg = (learningRate / 2) * np.power(param, 2).sum()
    return costFunction(param, Y, R, n_features) + reg
main.py
from scipy.io import loadmat
from gradient import *  # compute the gradients
from regularizedContent import *  # regularized cost function and gradients
data = loadmat('ex8_movies.mat')
Y, R = data['Y'], data['R']
param_data = loadmat('ex8_movieParams.mat')
X, Theta = param_data['X'], param_data['Theta']
n_users = 4
n_movies = 5
n_features = 3
X_sub = X[:n_movies, :n_features]
Theta_sub = Theta[:n_users, :n_features]
Y_sub = Y[:n_movies, :n_users]
R_sub = R[:n_movies, :n_users]
param_sub = serialize(X_sub, Theta_sub)
learningRate = 1.5
print(regularizedCostFunction(param_sub, Y_sub, R_sub, n_features, learningRate))
param = serialize(X, Theta)
print(regularizedCostFunction(param, Y, R, 10, learningRate=1))
31.34405624427422
32520.682450229557
2.2.4 Regularized gradient
Overview: regularize the gradients.
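Regularization simply adds λ times each parameter to the corresponding unregularized gradient:

\frac{\partial J_{reg}}{\partial x_k^{(i)}} = \frac{\partial J}{\partial x_k^{(i)}} + \lambda x_k^{(i)}, \qquad \frac{\partial J_{reg}}{\partial \theta_k^{(j)}} = \frac{\partial J}{\partial \theta_k^{(j)}} + \lambda \theta_k^{(j)}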
regularizedContent.py
import numpy as np
from costFunction import *  # the unregularized cost function
from gradient import *  # the unregularized gradients
def regularizedCostFunction(param, Y, R, n_features, learningRate):
    reg = (learningRate / 2) * np.power(param, 2).sum()
    return costFunction(param, Y, R, n_features) + reg
def regularizedGradient(param, Y, R, n_features, learningRate):
    grad = gradient(param, Y, R, n_features)
    # lambda * param penalizes every entry of X and Theta
    reg = learningRate * param
    return grad + reg
main.py
from scipy.io import loadmat
from gradient import *  # compute the gradients
from regularizedContent import *  # regularized cost function and gradients
data = loadmat('ex8_movies.mat')
Y, R = data['Y'], data['R']
param_data = loadmat('ex8_movieParams.mat')
X, Theta = param_data['X'], param_data['Theta']
n_movies, n_users = Y.shape
param = serialize(X, Theta)
learningRate = 1
X_grad, Theta_grad = deserialize(regularizedGradient(param, Y, R, 10, learningRate), n_movies, n_users, 10)
print(X_grad, Theta_grad)
[[-5.21315594 2.0591285 -5.68148384 ... -3.95439796 3.14612528
-1.59912133]
[-3.02846323 1.41931663 -2.11758176 ... -2.85177984 1.68511329
2.08666626]
[-2.4893923 2.00068576 0.1550494 ... -3.34923876 2.41055086
4.33843977]
...
[-0.82821934 0.7917289 -0.39662935 ... -0.60746963 0.28294163
-0.71223186]
[-0.62377152 0.60121466 -1.02043496 ... -0.84313998 1.30670669
-0.10603832]
[-0.31115177 0.86705203 -0.2616062 ... -1.64900127 1.08850949
0.16318173]] [[-1.26184516 7.39696961 -0.37924484 ... -3.15312086 4.55958584
0.91278897]
[-2.08328593 2.06877489 -1.20656461 ... -4.37487155 4.62450461
1.49336864]
[ 1.71397243 1.53009129 -3.47519602 ... -5.47031706 1.46428521
0.63418576]
...
[-3.53947561 2.83086629 -1.95973324 ... -2.70464586 2.25512788
0.52946291]
[-3.50593747 0.42141628 -1.62891338 ... -5.37296895 3.1012226
3.13766426]
[-2.92779591 1.05501291 -4.62312829 ... -5.27650042 4.22109195
2.11819114]]
2.3 Learning movie recommendations
Overview: enter personal movie ratings to generate individualized recommendations. A file maps each movie index to its title.
main.py
import numpy as np
# open the movie-title file
f = open('movie_ids.txt', encoding='gbk', errors='ignore')
movie_list = []
for line in f:
    tokens = line.strip().split(' ')
    movie_list.append(' '.join(tokens[1:]))  # keep the title, drop the leading index
movie_list = np.array(movie_list)
print(movie_list[0])
Toy Story (1995)
2.3.1 Recommendations
Overview: append personal ratings to the original data, train the model, and recommend movies the user is likely to enjoy.
First, a short Python demo of inserting the new column (nonzero ratings become 1, zeros stay 0):
test.py
import numpy as np
arr = np.array([
    [1, 1, 1],
    [1, 1, 0],
    [1, 1, 0]
]).reshape(3, 3)
rate = np.array([2, 3, 0]).reshape(3, 1)
# rate != 0 yields booleans, which np.append casts to arr's integer dtype
arr = np.append(rate != 0, arr, axis=1)
print(arr)
[[1 1 1 1]
[1 1 1 0]
[0 1 1 0]]
Normalize the ratings, then train the model.
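Mean normalization subtracts each movie's average rating (taken over the users who rated it) and zeroes out unrated entries:

\mu_i = \frac{\sum_j Y(i,j)\, R(i,j)}{\sum_j R(i,j)}, \qquad Y_{norm}(i,j) = \left(Y(i,j) - \mu_i\right) \cdot R(i,j)

Since Y is already zero wherever R is zero, Y.sum(axis=1) in the code equals the masked sum in the numerator.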
main.py
import numpy as np
from scipy.io import loadmat
from scipy.optimize import minimize
from costFunction import *
from regularizedContent import *
# open the movie-title file
f = open('movie_ids.txt', encoding='gbk', errors='ignore')
movie_list = []
for line in f:
    tokens = line.strip().split(' ')
    movie_list.append(' '.join(tokens[1:]))  # keep the title, drop the leading index
movie_list = np.array(movie_list)
ratings = np.zeros((1682, 1))  # our own ratings for a few movies
ratings[0] = 4
ratings[6] = 3
ratings[11] = 5
ratings[65] = 3
ratings[68] = 5
ratings[97] = 2
ratings[182] = 4
ratings[225] = 5
ratings[354] = 5
# print(ratings.shape) # (1682, 1)
# 1. we now become user 0
raw_data = loadmat('ex8_movies.mat')
Y, R = raw_data['Y'], raw_data['R']
Y = np.append(ratings, Y, axis=1)
# print(Y.shape)
R = np.append(ratings != 0, R, axis=1)
# print(R.shape) # (1682, 944)
n_movies, n_users = Y.shape
n_features = 10
learningRate = 10
# 2. randomly initialize the X and Theta parameters
# np.random.random((a, b)) draws an a-by-b array of floats uniform on [0, 1)
X = np.random.random((n_movies, n_features))
Theta = np.random.random((n_users, n_features))
param = serialize(X, Theta)
# print(X.shape, Theta.shape, param.shape) # (1682, 10) (944, 10) (26260,)
# 3. mean-normalize the ratings
Y_mean = (Y.sum(axis=1) / R.sum(axis=1)).reshape(-1, 1)
# print(Y_mean.shape) # (1682, 1)
Y_norm = np.multiply(Y - Y_mean, R)
# print(Y_norm.shape) # (1682, 944)
# 4. train the model
fmin = minimize(fun=regularizedCostFunction, x0=param, args=(Y_norm, R, n_features, learningRate), method='TNC',
jac=regularizedGradient)
X_trained, Theta_trained = deserialize(fmin.x, n_movies, n_users, n_features)
# print(X_trained.shape, Theta_trained.shape) # (1682, 10) (944, 10)
# 5. use the trained parameters to recommend movies
predictions = X_trained @ Theta_trained.T
# print(predictions.shape) # (1682, 944)
Y_mean = (Y.sum(axis=1) / R.sum(axis=1))
my_preds = predictions[:, 0] + Y_mean
# print(my_preds.shape) # (1682,)
# argsort returns the indices that would sort the array in ascending order
idx = np.argsort(my_preds)[::-1]  # reverse for descending order
for m in movie_list[idx][:5]:
    print(m)  # print the five movies with the highest predicted ratings
Santa with Muscles (1996)
Great Day in Harlem, A (1994)
Entertaining Angels: The Dorothy Day Story (1996)
They Made Me a Criminal (1939)
Saint of Fort Washington, The (1993)
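Note that these top picks are all obscure titles with very few ratings, so a handful of high scores can dominate a movie's predicted mean; this is a well-known quirk of this exercise, and restricting the ranking to movies with many ratings tends to yield more familiar recommendations.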