# 原文在这里 (blog artifact: "original article here" — commented out so the script can run)
'''
这部分代码是lwls.py文件,基于随机梯度下降方法,
这与CSDN大部分相关文章求解正规方程(中间需要求逆)的方法不同,
这里不需要求矩阵的逆。
但是,它带来的另外一个问题是如何调参,如何找到最好的参数…………
我暂时还没找到好方法……
欢迎留言评论
'''
# Locally Weighted Linear Regression is very inefficient because Parameters are calculated again for each test case
# But, it should give good results after tuning the hyper-parameter tau
import csv
import math
import numpy
def converge(t):
    """Return True when every component of the update vector t is within tolerance.

    Uses the module-level `epsilon` tolerance. Currently unused: the
    early-exit check in the SGD loop is commented out.
    """
    return all(abs(component) <= epsilon for component in t)
def stochastic_gradient_descent(w, theta):
    """Fit `theta` by weighted stochastic gradient descent and return it.

    Reads module-level globals: X_s / Y_s (training features and labels),
    alpha (learning rate) and max_n (number of epochs). w[i] is the locality
    weight of training sample i for the current query point. `theta` (a list)
    is updated in place and also returned.
    """
    for _ in range(max_n):
        for i in range(len(X_s)):
            x = numpy.array(X_s[i])
            # Hoisted out of the coordinate loop: the residual does not depend
            # on j, so the original recomputed the same dot product for every
            # coordinate (O(d^2) work per sample instead of O(d)).
            step = alpha * w[i] * (Y_s[i] - numpy.dot(numpy.array(theta), x))
            for j in range(len(theta)):
                theta[j] = theta[j] + step * x[j]
            # Early-exit on convergence was disabled in the original:
            # if converge([step * xj for xj in x]):
            #     return theta
    return theta
def get_data(name):
    """Load a CSV file into a list of float rows.

    Each inner list is one sample: feature values followed by the label (y)
    in the last position; no constant/bias term is included here.
    The original accumulated rows with `data = data + [row]`, which copies
    the whole list each iteration (O(n^2)); a comprehension builds it in O(n).
    """
    with open(name, 'r') as csv_file:
        return [[float(value) for value in row] for row in csv.reader(csv_file)]
def arrange_data(data):
    """Split samples into feature vectors and labels; return (Xs, Ys).

    Appends a constant 1 to each feature vector as the bias term; the label
    is taken from the last column. The original preallocated with
    `[[]]*len(data)`, which aliases a single shared list — harmless there
    because every slot was reassigned, but a classic pitfall; the
    comprehensions below avoid it entirely.
    """
    Xs = [row[:-1] + [1] for row in data]
    Ys = [row[-1] for row in data]
    return Xs, Ys
def weight(x_i, x):
    """Gaussian kernel weight between training feature vector x_i and query point x.

    Bandwidth comes from the module-level hyper-parameter `tau`.
    """
    diff = numpy.array(x_i) - numpy.array(x)
    sq_dist = numpy.dot(diff, diff)
    return math.exp(-1.0 * sq_dist / (2 * tau * tau))
def get_weights(Xs, x):
    """Kernel weights of query point x against every training sample in Xs."""
    # return [1]*len(Xs)  # uncomment for standard (unweighted) linear regression
    return [weight(x_train, x) for x_train in Xs]
def get_parameters(w, n):
    """Fit a length-n parameter vector for one query point, starting from zeros.

    Uses SGD rather than the closed-form normal equations, so no matrix
    inversion is needed — this is where the code differs from most write-ups.
    """
    initial = [0] * n
    return stochastic_gradient_descent(w, initial)
def get_prediction(w, x):
    """Predict y at feature vector x using parameters fitted with locality weights w."""
    fitted = get_parameters(w, len(x))
    return numpy.dot(numpy.array(fitted), numpy.array(x))
# --- Driver: evaluate locally weighted linear regression on a held-out test set ---
# NOTE(review): the generator section at the bottom of this file writes
# data_train.csv / data_test.csv, not the hw_-prefixed names read here —
# confirm which data files are intended.
data_train = get_data('hw_data_train.csv')
data_test = get_data('hw_data_test.csv')
# X_s and Y_s are read as module globals by stochastic_gradient_descent().
X_s,Y_s = arrange_data(data_train)
Xts,Yts = arrange_data(data_test)
# (HYPER-)PARAMETERS (read as globals by the functions above)
tau = 0.1 # Weight Parameter (kernel bandwidth used by weight())
alpha = 0.01 # Learning Rate
max_n = 1000 # Stochastic Gradient Descent Loops
epsilon = 0.0001 # Stochastic Gradient Descent Tolerance [not using here, though]
variance = float(0)  # accumulates squared error; becomes the mean squared error after the division below
for i in range(len(Xts)):
    x = Xts[i]
    y = Yts[i]
    # LWLR re-fits the parameters for every single test point — inherently slow.
    w = get_weights(X_s,x)
    prediction = get_prediction(w,x)
    print("Actual: " + str(y) + " Predicted: " + str(prediction))
    variance = variance + (prediction-y)**2
variance = variance/len(Xts)
print("Variance: ",variance)
# --- Data generation: write noiseless y = x^2 training and test samples as CSV ---
# NOTE(review): writes data_train.csv / data_test.csv, while the evaluation
# section above reads hw_data_train.csv / hw_data_test.csv — names do not match.
import numpy
num_train = 100
num_test = 20
# NOTE: append mode — re-running this script accumulates duplicate rows.
with open("data_train.csv", 'a') as file:
    x = -5
    for i in range(num_train):
        # The 0* factor disables the Gaussian noise; raise it above 0 for noisy data.
        y = (x)**2 + 0*numpy.random.normal(0, 1)
        # BUG FIX: the original wrote a literal "n" instead of a newline,
        # collapsing the whole file into one unparseable CSV line.
        file.write(str(x) + "," + str(y) + "\n")
        x = x + 10/num_train
with open("data_test.csv", 'a') as file:
    x = -5
    for i in range(num_test):
        y = (x)**2 + 0*numpy.random.normal(0, 1)
        file.write(str(x) + "," + str(y) + "\n")
        x = x + 10/num_test



