深度学习中,拟合模型的任务可以分为两个关键问题:
优化(optimization):用模型拟合观测数据的过程
泛化(generalization):数学原理和实践者的智慧,能够指导我们生成出有效性超出用于训练的数据集本身的模型
matplotlib是一个Python中流行的绘图库。要配置matplotlib生成图形的属性,我们需要定义几个函数:
use_svg_display函数指定matplotlib软件包输出svg图表以获得更清晰的图像。
def use_svg_display():  #@save
    """Ask matplotlib to emit SVG figures when plotting inside Jupyter.

    SVG is a vector format, so the rendered figures stay sharp at any zoom.

    NOTE(review): ``display`` is not defined in this snippet; presumably it is
    ``IPython.display`` imported elsewhere -- confirm. If so, be aware that
    ``IPython.display.set_matplotlib_formats`` is deprecated since IPython
    7.23 in favour of ``matplotlib_inline.backend_inline``.
    """
    display.set_matplotlib_formats('svg')
set_figsize函数用来设置图表大小。
def set_figsize(figsize=(3.5, 2.5)):  #@save
    """Set the default figure size used by matplotlib.

    Args:
        figsize: Value stored in ``rcParams['figure.figsize']``; matplotlib
            interprets it as a ``(width, height)`` pair in inches.

    Side effects:
        Switches the output format to SVG via ``use_svg_display`` and mutates
        the global ``d2l.plt.rcParams``, so later figures are affected too.
    """
    use_svg_display()
    d2l.plt.rcParams['figure.figsize'] = figsize
set_axes函数用于设置由matplotlib生成图表的轴的属性。
#@save
def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
    """Configure labels, scales, limits, legend and grid of a matplotlib axes.

    Args:
        axes: Axes-like object exposing ``set_xlabel``, ``set_ylabel``,
            ``set_xscale``, ``set_yscale``, ``set_xlim``, ``set_ylim``,
            ``legend`` and ``grid``.
        xlabel: Label passed to ``axes.set_xlabel``.
        ylabel: Label passed to ``axes.set_ylabel``.
        xlim: Limits passed to ``axes.set_xlim``.
        ylim: Limits passed to ``axes.set_ylim``.
        xscale: Scale name passed to ``axes.set_xscale``.
        yscale: Scale name passed to ``axes.set_yscale``.
        legend: Legend entries; the legend is only drawn when this is truthy.
    """
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_xscale(xscale)
    axes.set_yscale(yscale)
    axes.set_xlim(xlim)
    axes.set_ylim(ylim)
    if legend:
        axes.legend(legend)
    # The grid is always enabled, whether or not a legend was drawn.
    axes.grid()
通过这三个用于图形配置的函数,我们定义了plot函数来简洁地绘制多条曲线。
#@save
def plot(X, Y=None, xlabel=None, ylabel=None, legend=None, xlim=None,
         ylim=None, xscale='linear', yscale='linear',
         fmts=('-', 'm--', 'g-.', 'r:'), figsize=(3.5, 2.5), axes=None):
    """Plot one or more curves on a single axes.

    Args:
        X: x-values. Either one 1-D series (an object with ``ndim == 1`` or a
            list of scalars) or a list of such series. When ``Y`` is omitted,
            ``X`` is treated as the y-values and plotted against its index.
        Y: y-values, one series or a list of series. Optional.
        xlabel, ylabel: Axis labels forwarded to ``set_axes``.
        legend: Legend entries; ``None`` is replaced by an empty list.
        xlim, ylim: Axis limits forwarded to ``set_axes``.
        xscale, yscale: Axis scale names forwarded to ``set_axes``.
        fmts: Format strings, one per curve. Curves beyond ``len(fmts)`` are
            silently dropped because the series are zipped with ``fmts``.
        figsize: Figure size forwarded to ``set_figsize``.
        axes: Axes to draw on; falls back to ``d2l.plt.gca()`` when falsy.
    """
    if legend is None:
        legend = []

    set_figsize(figsize)
    axes = axes if axes else d2l.plt.gca()

    def has_one_axis(X):
        """Return True if ``X`` is a single 1-D series rather than several."""
        return (hasattr(X, "ndim") and X.ndim == 1
                or isinstance(X, list) and not hasattr(X[0], "__len__"))

    if has_one_axis(X):
        X = [X]
    if Y is None:
        # Only one argument given: treat it as y-values with implicit x.
        X, Y = [[]] * len(X), X
    elif has_one_axis(Y):
        Y = [Y]
    if len(X) != len(Y):
        # Reuse the same x-values for every y-series.
        X = X * len(Y)
    axes.cla()
    for x, y, fmt in zip(X, Y, fmts):
        if len(x):
            axes.plot(x, y, fmt)
        else:
            axes.plot(y, fmt)
    set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
调用以上函数,绘制函数u=f(x)及其在x=1处的切线y=2x-3:
# Sample x on [0, 3) and draw f(x) together with the line 2x - 3.
# `f` is expected to be defined before this snippet runs.
x = np.arange(0, 3, 0.1)
plot(x, [f(x), 2 * x - 3], 'x', 'f(x)', legend=['f(x)', 'Tangent line (x = 1)'])
下面是一个简单的使用PyTorch来构建LSTM模型的例子。
我们使用正弦函数和余弦函数来构造时间序列,而正余弦函数之间是导数关系,所以我们可以构造模型来学习正弦函数与余弦函数之间的映射关系,通过输入正弦函数的值来预测对应的余弦函数的值。因此,将正弦函数的值作为LSTM的输入,来预测余弦函数的值。
# -*- coding:UTF-8 -*-
import numpy as np
import torch
from torch import nn
import matplotlib.pyplot as plt
# Define LSTM Neural Networks
class LstmRNN(nn.Module):
    """LSTM followed by a linear read-out applied at every time step.

    Parameters:
        - input_size: feature size
        - hidden_size: number of hidden units
        - output_size: number of output
        - num_layers: layers of LSTM to stack
    """

    def __init__(self, input_size, hidden_size=1, output_size=1, num_layers=1):
        super().__init__()
        # batch_first is left at its default, so inputs are
        # (seq_len, batch, input_size).
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
        # Maps each hidden state to the output features. The attribute name is
        # kept as-is so previously saved state_dict keys still load.
        self.forwardCalculation = nn.Linear(hidden_size, output_size)

    def forward(self, _x):
        """Run the LSTM over ``_x`` and project every step to the output size.

        Args:
            _x: input tensor of size (seq_len, batch, input_size).

        Returns:
            Tensor of size (seq_len, batch, output_size).
        """
        hidden_seq, _ = self.lstm(_x)  # (seq_len, batch, hidden_size)
        # nn.Linear acts on the last dimension, so no flatten/unflatten
        # round-trip through view() is needed.
        return self.forwardCalculation(hidden_seq)
if __name__ == '__main__':
    # Demo: train an LSTM to map sin(t) to cos(t) on the first half of a
    # series, then predict the second half and plot both.

    # create dataset: column 0 is sin(t), column 1 is cos(t)
    data_len = 200
    t = np.linspace(0, 12*np.pi, data_len)
    sin_t = np.sin(t)
    cos_t = np.cos(t)

    dataset = np.zeros((data_len, 2))
    dataset[:, 0] = sin_t
    dataset[:, 1] = cos_t
    dataset = dataset.astype('float32')  # match torch's default float dtype

    # plot part of the original dataset
    plt.figure()
    plt.plot(t[0:60], dataset[0:60, 0], label='sin(t)')
    plt.plot(t[0:60], dataset[0:60, 1], label = 'cos(t)')
    plt.plot([2.5, 2.5], [-1.3, 0.55], 'r--', label='t = 2.5')  # t = 2.5
    plt.plot([6.8, 6.8], [-1.3, 0.85], 'm--', label='t = 6.8')  # t = 6.8
    plt.xlabel('t')
    plt.ylim(-1.2, 1.2)
    plt.ylabel('sin(t) and cos(t)')
    plt.legend(loc='upper right')

    # choose dataset for training and testing
    train_data_ratio = 0.5  # first 50% of the data for training, rest for testing
    train_data_len = int(data_len*train_data_ratio)
    train_x = dataset[:train_data_len, 0]
    train_y = dataset[:train_data_len, 1]
    INPUT_FEATURES_NUM = 1
    OUTPUT_FEATURES_NUM = 1
    t_for_training = t[:train_data_len]

    test_x = dataset[train_data_len:, 0]
    test_y = dataset[train_data_len:, 1]
    t_for_testing = t[train_data_len:]

    # ----------------- train -------------------
    # Reshape to (seq_len, batch=5, features). The sample count must be
    # divisible by 5 for the reshape to succeed.
    train_x_tensor = train_x.reshape(-1, 5, INPUT_FEATURES_NUM)
    train_y_tensor = train_y.reshape(-1, 5, OUTPUT_FEATURES_NUM)

    # transfer data to pytorch tensor
    train_x_tensor = torch.from_numpy(train_x_tensor)
    train_y_tensor = torch.from_numpy(train_y_tensor)

    lstm_model = LstmRNN(INPUT_FEATURES_NUM, 16, output_size=OUTPUT_FEATURES_NUM, num_layers=1)  # 16 hidden units
    print('LSTM model:', lstm_model)
    print('model.parameters:', lstm_model.parameters)

    loss_function = nn.MSELoss()
    optimizer = torch.optim.Adam(lstm_model.parameters(), lr=1e-2)

    max_epochs = 10000
    for epoch in range(max_epochs):
        output = lstm_model(train_x_tensor)
        loss = loss_function(output, train_y_tensor)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()  # clear gradients before the next iteration

        if loss.item() < 1e-4:
            # early stop once the target loss is reached
            print('Epoch [{}/{}], Loss: {:.5f}'.format(epoch+1, max_epochs, loss.item()))
            print("The loss value is reached")
            break
        elif (epoch+1) % 100 == 0:
            print('Epoch: [{}/{}], Loss:{:.5f}'.format(epoch+1, max_epochs, loss.item()))

    # prediction on training dataset (no gradients needed for inference)
    with torch.no_grad():
        predictive_y_for_training = lstm_model(train_x_tensor)
    predictive_y_for_training = predictive_y_for_training.view(-1, OUTPUT_FEATURES_NUM).numpy()

    # torch.save(lstm_model.state_dict(), 'model_params.pkl')  # save model parameters to files

    # ----------------- test -------------------
    # lstm_model.load_state_dict(torch.load('model_params.pkl'))  # load model parameters from files
    lstm_model = lstm_model.eval()  # switch to evaluation mode

    # prediction on test dataset; same (seq_len, 5, features) layout as training
    test_x_tensor = test_x.reshape(-1, 5, INPUT_FEATURES_NUM)
    test_x_tensor = torch.from_numpy(test_x_tensor)

    with torch.no_grad():
        predictive_y_for_testing = lstm_model(test_x_tensor)
    predictive_y_for_testing = predictive_y_for_testing.view(-1, OUTPUT_FEATURES_NUM).numpy()

    # ----------------- plot -------------------
    plt.figure()
    plt.plot(t_for_training, train_x, 'g', label='sin_trn')
    plt.plot(t_for_training, train_y, 'b', label='ref_cos_trn')
    plt.plot(t_for_training, predictive_y_for_training, 'y--', label='pre_cos_trn')

    plt.plot(t_for_testing, test_x, 'c', label='sin_tst')
    plt.plot(t_for_testing, test_y, 'k', label='ref_cos_tst')
    plt.plot(t_for_testing, predictive_y_for_testing, 'm--', label='pre_cos_tst')

    plt.plot([t[train_data_len], t[train_data_len]], [-1.2, 4.0], 'r--', label='separation line')  # separation line

    plt.xlabel('t')
    plt.ylabel('sin(t) and cos(t)')
    plt.xlim(t[0], t[-1])
    plt.ylim(-1.2, 4)
    plt.legend(loc='upper right')
    plt.text(14, 2, "train", size = 15, alpha = 1.0)
    plt.text(20, 2, "test", size = 15, alpha = 1.0)

    plt.show()
2 动手学大数据
2.1 群起集群并测试
启动前配置workers,集群中有几台节点,workers中就配置几个主机名称。
vim /opt/module/hadoop-3.1.3/etc/hadoop/workers
在该文件中增加如下内容:
hadoop102
hadoop103
hadoop104
该文件中添加的内容结尾不允许有空格,文件中不允许有空行,否则无法正确识别主机名称
同步所有节点的配置文件
xsync /opt/module/hadoop-3.1.3/etc/hadoop/workers
启动集群
先进入Hadoop安装目录
cd $HADOOP_HOME
如果集群是第一次启动,需要在hadoop102节点格式化NameNode
格式化NameNode,会产生新的集群id,导致NameNode和DataNode的集群id不一致,集群找不到以往数据。如果集群在运行过程中报错,需要重新格式化NameNode的话,一定要先停止NameNode和DataNode进程,并且要删除所有机器的data和logs目录,然后再进行格式化。
初始化命令:
hdfs namenode -format
初始化后可以看见新增data文件夹和logs文件夹,在data->dfs->name->current中,VERSION文件记录了namespaceID, clusterID等重要信息。
启动命令:
sbin/start-dfs.sh
可以看到NameNode、DataNode和SecondaryNameNode都在启动,如下图所示
验证是否启动成功
jps
在上一篇博文中,规划了集群的hdfs如何设置:三台机器均为DataNode,NameNode在hadoop102上,SecondaryNameNode在hadoop104上,因此应注意查看服务器上相应功能是否启动。
再在hadoop103上启动ResourceManager并验证
sbin/start-yarn.sh
在hadoop103上启动YARN后,hadoop103自身会启动ResourceManager,其它节点则启动NodeManager
测试集群
测试一:上传小文件到集群
在hadoop102的$HADOOP_HOME中,执行命令:
hadoop fs -mkdir /wcinput
在浏览器中输入“启动dfs的服务器的IP:9870”,然后依次点击 Utilities -> Browse the file system,即可浏览HDFS上的文件
hadoop fs -put wcinput/word.txt /wcinput
可在web端访问到wcinput文件夹中的word.txt
测试二:上传大文件到集群
将jdk上传至集群根目录
hadoop fs -put /opt/software/jdk-8u202-linux-x64.tar.gz /
-put表示上传,-get表示下载
2.2 数据库与大数据是什么关系
从宏观层面来看,大数据应该属于数据库的一种形态。从更细粒度的角度来看,数据库主要用来处理联机交易和中等规模的数据分析,强调高性能低延时的数据存取。而大数据一般面向海量数据,以及基于这些数据从产生、收集、存储到计算的分布式计算框架,如Hadoop、Spark生态下的各种软件和框架。
3 总结与展望
深度学习部分,认识了matplotlib这一Python中流行的绘图库,并动手实践了一个LSTM模型。大数据部分,对完全分布式集群做了群起与测试。接下来将进行HDFS+MapReduce+Yarn模块的详细开发。



