栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

python数据科学入门(读书笔记)-线性代数、统计学、概率、假设与推断、梯度下降

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

python数据科学入门(读书笔记)-线性代数、统计学、概率、假设与推断、梯度下降

文章目录

线性代数

向量、矩阵

统计学

中位数、众数、分位数、极差、分位数之差、方差、标准差、协方差、相关系数、排除异常值

概率论

条件概率、正态分布、中心极限定理

假设与推断

梯度下降

线性代数

一个包含三个数字的列表对应一个三维空间的向量

向量

向量的和

两个向量相加

def vector_add(v, w):
    """Add two vectors component by component.

    Components are paired with ``zip``, so the result is as long as the
    shorter of the two inputs.
    """
    summed = []
    for left, right in zip(v, w):
        summed.append(left + right)
    return summed

# Demo: add two 3-dimensional vectors.
v = [1, 2, 3]
w = [3, 4, 5]
vw = vector_add(v, w)
print(vw)  # prints [4, 6, 8]

多个向量相加

## Method 1: accumulate the sum with an explicit loop
def vector_add(v, w):
    """Return the component-wise sum of v and w (truncated to the shorter one)."""
    return list(map(lambda left, right: left + right, v, w))

def vector_sum(vectors):
    """Sum a non-empty sequence of vectors component-wise.

    Indexing the first vector raises IndexError for an empty sequence. When
    only one vector is given, that same object is returned unchanged.
    """
    running_total = vectors[0]  # seed the total with the first vector
    remaining = vectors[1:]
    for current in remaining:  # fold every later vector into the total
        running_total = vector_add(running_total, current)
    return running_total

# Demo: sum three vectors.
v = [1, 2, 3]
w = [3, 4, 5]
u = [3, 4, 5]
vectors = [v, w, u]
print(vector_sum(vectors))  # prints [7, 10, 13]

#方法二
from functools import reduce

def vector_add(v, w):
    """Add two vectors component by component.

    Components are paired with ``zip``, so extra trailing components of the
    longer vector are ignored.
    """
    result = []
    for a, b in zip(v, w):
        result.append(a + b)
    return result
    
def vector_sum(vectors):
    """Fold ``vector_add`` over ``vectors`` with ``reduce`` and return the total.

    ``reduce`` is called without an initial value, so an empty ``vectors``
    raises TypeError.
    """
    return reduce(lambda running, vector: vector_add(running, vector), vectors)

# Demo: the reduce-based vector_sum on three vectors.
v = [1, 2, 3]
w = [3, 4, 5]
u = [3, 4, 5]
vectors = [v, w, u]
print(vector_sum(vectors))  # prints [7, 10, 13]

#方法三
from functools import partial
from functools import reduce

def vector_add(v, w):
    """Return the component-wise sum of v and w (truncated to the shorter one)."""
    return list(map(lambda first, second: first + second, v, w))

# Method 3: pre-bind vector_add as reduce's combining function, so that
# vector_sum(vectors) evaluates reduce(vector_add, vectors).
vector_sum = partial(reduce, vector_add)

# Demo: the partial-based vector_sum on three vectors.
v = [1, 2, 3]
w = [3, 4, 5]
u = [5, 4, 5]
vectors = [v, w, u]
print(vector_sum(vectors))  # prints [9, 10, 13]

其中:
functools模块为高阶函数提供支持,reduce函数、partial函数皆在其中,具体用法见此文
https://blog.csdn.net/qq_33688922/article/details/91890142

向量的乘积

向量乘以标量

def scalar_multiply(c, v):
    """Multiply every component of vector v by the scalar c."""
    scaled = []
    for component in v:
        scaled.append(c * component)
    return scaled

向量的均值(先对一系列向量求和,再乘以标量 1/n)

def scalar_multiply(c, v):
    """Return a new list holding c times each component of v."""
    return list(map(lambda component: c * component, v))

def vector_mean(vectors):
    """Return the component-wise mean of ``vectors``.

    The sum of the vectors is scaled by 1 / len(vectors); an empty input
    raises ZeroDivisionError before any summing is attempted.
    """
    weight = 1 / len(vectors)
    return scalar_multiply(weight, vector_sum(vectors))

向量的点乘

def dot(v, w):
    """Return the dot product v_1 * w_1 + ... + v_n * w_n."""
    total = 0
    for v_i, w_i in zip(v, w):
        total += v_i * w_i
    return total

计算两个向量之间的距离

两个向量之间的距离 = $\sqrt{(v_1-w_1)^2+\cdots+(v_n-w_n)^2}$

import math
# Sum of the component-wise products v_i * w_i
def dot(v, w):
    """Return the dot product of v and w."""
    return sum(map(lambda a, b: a * b, v, w))
# Component-wise differences v_i - w_i
def vector_subtract(v, w):
    """Subtract w from v component by component (truncated to the shorter one)."""
    differences = []
    for minuend, subtrahend in zip(v, w):
        differences.append(minuend - subtrahend)
    return differences
# v_1 ** 2 + ... + v_n ** 2
def sum_of_squares(v):
    """Return the sum of the squared components of v (v dotted with itself)."""
    squares_total = dot(v, v)
    return squares_total
# Disabled helper: the definition below sits inside a string literal, so
# magnitude() is never defined at runtime. As written it would return the
# square root of sum_of_squares(v), i.e. the Euclidean length of v.
'''
def magnitude(v):
    return math.sqrt(sum_of_squares(v)) # math.sqrt是平方根函数
'''
# Sum of squared component differences
def squared_distance(v, w):
    """Return (v_1 - w_1) ** 2 + ... + (v_n - w_n) ** 2."""
    difference = vector_subtract(v, w)
    return sum_of_squares(difference)
# Square root of the squared distance: the Euclidean distance
def distance(v, w):
    """Return the Euclidean distance between vectors v and w."""
    squared = squared_distance(v, w)
    return math.sqrt(squared)

# Demo: distance between two 3-dimensional vectors.
v = [1, 2, 3]
w = [3, 4, 5]
vwdistance = distance(v, w)
print(vwdistance)  # prints 3.4641016151377544

矩阵

矩阵相当于 列表的列表,内部列表的大小一样,一个内部列表表示矩阵的一行

矩阵中各参数的意义
A[i][j]表示第i行第j列的元素
A[i]表示第i行的向量
若记 A_i = A[i],则 A_i[j] 同样表示第i行第j列的元素(Python 中行、列下标均从 0 开始)
len(A)表示矩阵的行数
len(A[0])表示矩阵的列数

def shape(A):
    """Return ``(rows, columns)`` for a matrix stored as a list of row lists.

    The column count is the length of the first row; an empty matrix
    reports zero columns.
    """
    if not A:
        return len(A), 0
    return len(A), len(A[0])

# Demo: a matrix with 2 rows and 3 columns.
A = [
    [1, 2, 3],
    [4, 5, 6],
]
print(shape(A))  # prints (2, 3)

对角线为1的5x5单位矩阵

def make_matrix(num_rows, num_cols, entry_fn):
    """Build a num_rows x num_cols matrix whose (i, j) entry is entry_fn(i, j).

    Entries are generated row by row, left to right within each row.
    """
    matrix = []
    for i in range(num_rows):
        row = []
        for j in range(num_cols):
            row.append(entry_fn(i, j))
        matrix.append(row)
    return matrix

def is_diagonal(i, j):
    """Return 1 when position (i, j) lies on the main diagonal, otherwise 0."""
    if i == j:
        return 1
    return 0

# Demo: the 5 x 5 identity matrix (ones on the diagonal, zeros elsewhere).
identity_matrix = make_matrix(num_rows=5, num_cols=5, entry_fn=is_diagonal)
print(identity_matrix)

统计学 中位数
# Sample data for the median example.
num_friends = [100, 49, 41, 40, 25]

def median(v):
    """Return the median of v.

    For an odd number of values this is the middle value of the sorted data;
    for an even number it is the mean of the two middle values.
    """
    size = len(v)
    ordered = sorted(v)
    middle = size // 2
    if size % 2:
        # Odd count: a single middle element exists.
        return ordered[middle]
    # Even count: average the two elements around the centre.
    return (ordered[middle - 1] + ordered[middle]) / 2

# For the five-value sample [100, 49, 41, 40, 25] this prints 41.
print(median(num_friends))
众数
from collections import Counter

# Sample data for the mode example (49 appears twice).
num_friends = [100, 49, 49, 40, 25]

def mode(x):
    """Return a list of the most frequent value(s) in x; ties are all included."""
    frequency = Counter(x)
    highest = max(frequency.values())
    winners = []
    for value, occurrences in frequency.items():
        if occurrences == highest:
            winners.append(value)
    return winners

# For the sample [100, 49, 49, 40, 25] this prints [49].
print(mode(num_friends))
分位数
# Sample data for the quantile example.
num_friends = [100, 49, 41, 40, 25]

def quantile(x, p):
    """Return the value found a fraction ``p`` of the way through sorted ``x``.

    Args:
        x: a non-empty sequence of comparable values.
        p: the fraction, normally between 0 and 1 inclusive.

    Returns:
        The element at index ``int(p * len(x))`` of the sorted data. The
        index is capped at the last position, so ``p == 1`` (or anything
        larger) yields the maximum.

    Raises:
        IndexError: if ``x`` is empty.
    """
    ordered = sorted(x)
    # int(p * len(x)) equals len(x) when p == 1, which is one past the end;
    # cap it so asking for the top quantile does not raise.
    p_index = min(int(p * len(x)), len(x) - 1)
    return ordered[p_index]

# For the sample [100, 49, 41, 40, 25] this prints: 25 40 49 100
print(quantile(num_friends, 0.10), 
      quantile(num_friends, 0.25),
      quantile(num_friends, 0.75),
      quantile(num_friends, 0.90))
极差
# Sample data for the range example.
num_friends = [100, 48, 49, 40, 25]

def data_range(x):
    """Return the spread of x: its largest value minus its smallest."""
    largest = max(x)
    smallest = min(x)
    return largest - smallest

# For the sample [100, 48, 49, 40, 25] this prints 75.
print(data_range(num_friends))
分位数之差
# Sample data for the interquartile-range example.
num_friends = [100, 49, 41, 40, 25]

def quantile(x, p):
    """Return the value found a fraction ``p`` of the way through sorted ``x``.

    Args:
        x: a non-empty sequence of comparable values.
        p: the fraction, normally between 0 and 1 inclusive.

    Returns:
        The element at index ``int(p * len(x))`` of the sorted data. The
        index is capped at the last position, so ``p == 1`` (or anything
        larger) yields the maximum.

    Raises:
        IndexError: if ``x`` is empty.
    """
    ordered = sorted(x)
    # int(p * len(x)) equals len(x) when p == 1, which is one past the end;
    # cap it so asking for the top quantile does not raise.
    p_index = min(int(p * len(x)), len(x) - 1)
    return ordered[p_index]

def interquartile_range(x):
    """Return the 75% quantile of x minus its 25% quantile."""
    upper = quantile(x, 0.75)
    lower = quantile(x, 0.25)
    return upper - lower

# For the sample [100, 49, 41, 40, 25] this prints 9 (49 - 40).
print(interquartile_range(num_friends))
方差
# Sample data for the variance example.
num_friends = [100, 48, 49, 40, 25]

def mean(x):
    """Return the arithmetic mean of x (ZeroDivisionError when x is empty)."""
    total = sum(x)
    count = len(x)
    return total / count

def de_mean(x):
    """Shift x so its mean becomes zero: subtract the mean from every value."""
    centre = mean(x)
    shifted = []
    for value in x:
        shifted.append(value - centre)
    return shifted

def dot(v, w):
    """Return the dot product v_1 * w_1 + ... + v_n * w_n."""
    total = 0
    for v_i, w_i in zip(v, w):
        total += v_i * w_i
    return total

def sum_of_squares(v):
    """Return the sum of the squared components of v (v dotted with itself)."""
    squares_total = dot(v, v)
    return squares_total

def variance(x):
    """Return the sample variance of x.

    The squared deviations from the mean are summed and divided by
    ``len(x) - 1``, so a single-element input raises ZeroDivisionError.
    """
    degrees_of_freedom = len(x) - 1
    squared_spread = sum_of_squares(de_mean(x))
    return squared_spread / degrees_of_freedom

# For the sample [100, 48, 49, 40, 25] this prints approximately 800.3.
print(variance(num_friends))
标准差
import math

# Sample data for the standard-deviation example.
num_friends = [100, 49, 41, 40, 25]

def mean(x):
    """Return the arithmetic mean of x (ZeroDivisionError when x is empty)."""
    total = sum(x)
    count = len(x)
    return total / count

def de_mean(x):
    """Return the deviations of x from its mean, in the original order."""
    centre = mean(x)
    return list(map(lambda value: value - centre, x))

def dot(v, w):
    """Return the dot product of v and w."""
    return sum(map(lambda a, b: a * b, v, w))

def sum_of_squares(v):
    """Return v_1 ** 2 + ... + v_n ** 2, computed as the dot product of v with itself."""
    squares_total = dot(v, v)
    return squares_total

def variance(x):
    """Return the sample variance of x.

    The squared deviations from the mean are summed and divided by
    ``len(x) - 1``, so a single-element input raises ZeroDivisionError.
    """
    degrees_of_freedom = len(x) - 1
    squared_spread = sum_of_squares(de_mean(x))
    return squared_spread / degrees_of_freedom

def standard_deviation(x):
    """Return the sample standard deviation of x: the square root of its variance."""
    spread = variance(x)
    return math.sqrt(spread)

# For the sample [100, 49, 41, 40, 25] this prints approximately 28.73.
print(standard_deviation(num_friends) )
协方差
# Paired sample data for the covariance example.
num_friends = [100, 49, 41, 40, 25]
daily_minutes = [50, 30, 20, 21, 25]

def mean(x):
    """Return the arithmetic mean of x (ZeroDivisionError when x is empty)."""
    total = sum(x)
    count = len(x)
    return total / count

def de_mean(x):
    """Shift x so its mean becomes zero: subtract the mean from every value."""
    centre = mean(x)
    shifted = []
    for value in x:
        shifted.append(value - centre)
    return shifted

def dot(v, w):
    """Return the dot product v_1 * w_1 + ... + v_n * w_n."""
    total = 0
    for v_i, w_i in zip(v, w):
        total += v_i * w_i
    return total

def covariance(x, y):
    """Return the sample covariance of x and y.

    The deviations of x and of y from their means are dotted together and
    divided by ``len(x) - 1``.
    """
    degrees_of_freedom = len(x) - 1
    joint_spread = dot(de_mean(x), de_mean(y))
    return joint_spread / degrees_of_freedom

# For num_friends = [100, 49, 41, 40, 25] and daily_minutes = [50, 30, 20, 21, 25]
# this prints approximately 327.25.
print(covariance(num_friends, daily_minutes) )
相关系数
import math

# Paired sample data for the correlation example.
num_friends = [100, 49, 41, 40, 25]
daily_minutes = [50, 30, 20, 21, 25]

def mean(x):
    """Return the arithmetic mean of x (ZeroDivisionError when x is empty)."""
    total = sum(x)
    count = len(x)
    return total / count

def de_mean(x):
    """Return the deviations of x from its mean, in the original order."""
    centre = mean(x)
    return list(map(lambda value: value - centre, x))

def dot(v, w):
    """Return the dot product of v and w."""
    return sum(map(lambda a, b: a * b, v, w))

def covariance(x, y):
    """Return the sample covariance of x and y.

    The deviations of x and of y from their means are dotted together and
    divided by ``len(x) - 1``.
    """
    degrees_of_freedom = len(x) - 1
    joint_spread = dot(de_mean(x), de_mean(y))
    return joint_spread / degrees_of_freedom

def sum_of_squares(v):
    """Return the sum of the squared components of v (v dotted with itself)."""
    squares_total = dot(v, v)
    return squares_total

def variance(x):
    """Return the sample variance of x.

    The squared deviations from the mean are summed and divided by
    ``len(x) - 1``, so a single-element input raises ZeroDivisionError.
    """
    degrees_of_freedom = len(x) - 1
    squared_spread = sum_of_squares(de_mean(x))
    return squared_spread / degrees_of_freedom

def standard_deviation(x):
    """Return the sample standard deviation of x: the square root of its variance."""
    spread = variance(x)
    return math.sqrt(spread)

def correlation(x, y):
    """Return the correlation of x and y.

    This is the covariance divided by both standard deviations. When either
    standard deviation is not positive (no variation), 0 is returned instead.
    """
    spread_x = standard_deviation(x)
    spread_y = standard_deviation(y)
    if not (spread_x > 0 and spread_y > 0):
        return 0  # no variation, so the correlation is reported as zero
    return covariance(x, y) / spread_x / spread_y

# For num_friends = [100, 49, 41, 40, 25] and daily_minutes = [50, 30, 20, 21, 25]
# this prints approximately 0.93.
print(correlation(num_friends, daily_minutes))
排除异常值
# Drop the observation whose friend count is 100 from both series, then
# recompute the correlation without it.
outlier = num_friends.index(100)  # position of the outlier

num_friends_good = [x for i, x in enumerate(num_friends) if i != outlier]
daily_minutes_good = [x for i, x in enumerate(daily_minutes) if i != outlier]

# Print the result instead of discarding it as a bare expression. With
# num_friends = [100, 49, 41, 40, 25] and daily_minutes = [50, 30, 20, 21, 25]
# the value is about 0.26 (the earlier "0.57" note does not match this data).
print(correlation(num_friends_good, daily_minutes_good))

概率论 条件概率

生男生女

import random

def random_kid():
    """Pick "boy" or "girl" at random via ``random.choice``."""
    options = ["boy", "girl"]
    return random.choice(options)

# Simulate 10000 two-child families and estimate two conditional probabilities.
random.seed(0)
both_girls = older_girl = either_girl = 0

for _ in range(10000):
    # The younger child is drawn first, then the older one.
    younger = random_kid()
    older = random_kid()
    if older == "girl":
        older_girl += 1
        if younger == "girl":
            both_girls += 1
    if "girl" in (older, younger):
        either_girl += 1

print("P(both | older):", both_girls / older_girl)
print("P(both | either): ", both_girls / either_girl)
正态分布
import math
import matplotlib.pyplot as plt

def normal_pdf(x, mu=0, sigma=1):
    """Return the normal density with mean mu and standard deviation sigma at x."""
    squared_offset = (x - mu) ** 2
    exponent = -squared_offset / 2 / sigma ** 2
    normaliser = math.sqrt(2 * math.pi) * sigma
    return math.exp(exponent) / normaliser

# Plot the density for four parameter choices on the grid -5.0, -4.9, ..., 4.9.
xs = [step / 10.0 for step in range(-50, 50)]
for params, line_style, caption in [
    ({"sigma": 1}, '-', 'mu=0,sigma=1'),
    ({"sigma": 2}, '--', 'mu=0,sigma=2'),
    ({"sigma": 0.5}, ':', 'mu=0,sigma=0.5'),
    ({"mu": -1}, '-.', 'mu=-1,sigma=1'),
]:
    plt.plot(xs, [normal_pdf(x, **params) for x in xs], line_style, label=caption)
plt.legend()
plt.title("多个正态分布的概率密度函数")
plt.show()

正态分布概率累积分布函数

import math
import matplotlib.pyplot as plt

def normal_cdf(x, mu=0,sigma=1):
    """Return the normal cumulative distribution function at x, computed with ``math.erf``."""
    scaled = (x - mu) / math.sqrt(2) / sigma
    return (1 + math.erf(scaled)) / 2

# Plot the cumulative distribution for four parameter choices on the grid
# -5.0, -4.9, ..., 4.9.
xs = [step / 10.0 for step in range(-50, 50)]
for params, line_style, caption in [
    ({"sigma": 1}, '-', 'mu=0,sigma=1'),
    ({"sigma": 2}, '--', 'mu=0,sigma=2'),
    ({"sigma": 0.5}, ':', 'mu=0,sigma=0.5'),
    ({"mu": -1}, '-.', 'mu=-1,sigma=1'),
]:
    plt.plot(xs, [normal_cdf(x, **params) for x in xs], line_style, label=caption)
plt.legend(loc=4)  # bottom right
plt.title("多个正态分布的累积分布函数")
plt.show()


中心极限定理
import math
import matplotlib.pyplot as plt
from collections import Counter
import random

def normal_cdf(x, mu=0,sigma=1):
    """Return the normal cumulative distribution function at x, computed with ``math.erf``."""
    scaled = (x - mu) / math.sqrt(2) / sigma
    return (1 + math.erf(scaled)) / 2

def bernoulli_trial(p):
    """Return 1 when a fresh ``random.random()`` draw is below p, otherwise 0."""
    if random.random() < p:
        return 1
    return 0

def binomial(n, p):
    """Return the number of successes in n calls to ``bernoulli_trial(p)``."""
    successes = 0
    for _ in range(n):
        successes += bernoulli_trial(p)
    return successes

def make_hist(p, n, num_points):
    """Plot simulated binomial frequencies against their normal approximation.

    Draws ``num_points`` samples with ``binomial(n, p)``, shows the fraction
    of samples at each value as grey bars, overlays the normal approximation
    with mean ``p * n`` and standard deviation ``sqrt(n * p * (1 - p))``, and
    displays the figure.

    Args:
        p: success probability passed to ``binomial``.
        n: number of trials passed to ``binomial``.
        num_points: how many samples to draw.
    """
    data = [binomial(n, p) for _ in range(num_points)]
    # Bar chart of the observed sample frequencies.
    histogram = Counter(data)
    # Centre every 0.8-wide bar on its integer outcome. The former "x - 0.4"
    # shift only lines up when bars are edge-aligned; with centre alignment it
    # pushed each bar 0.4 to the left of the curve. Alignment is passed
    # explicitly so the picture does not depend on the library default.
    plt.bar(list(histogram.keys()),
            [v / num_points for v in histogram.values()],
            0.8,
            color='0.75',
            align='center')
    mu = p * n
    sigma = math.sqrt(n * p * (1 - p))
    # Line chart of the normal approximation: for each integer i, the normal
    # probability of the interval (i - 0.5, i + 0.5).
    xs = range(min(data), max(data) + 1)
    ys = [normal_cdf(i + 0.5, mu, sigma) - normal_cdf(i - 0.5, mu, sigma) for i in xs]
    plt.plot(xs,ys)
    plt.title("二项分布与正态近似")
    plt.show()

# Draw 10000 samples with p = 0.75 and n = 100 and show the resulting figure.
make_hist(0.75, 100, 10000)

假设与推断
梯度下降
转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/740772.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号