def get_debt_rank(self):
    '''
    Count, for each age bracket, the records with a light and with a heavy
    debt ratio.

    The brackets are delimited by the ages 0, 30, 45, 60, 75 and 100. Within
    a bracket a record counts as "light" when ``DebtRatio < 1`` and as
    "heavy" when ``DebtRatio >= 1``.

    :return: a one-element list wrapping the per-bracket result, i.e.
        ``[[[light, heavy], ...]]`` with one ``[light, heavy]`` pair of
        counts per age bracket, in ascending bracket order
    '''
    age_edges = [0, 30, 45, 60, 75, 100]
    df_age_debt = df.select(df['age'], df['DebtRatio'])
    age_col = df_age_debt['age']
    debt_col = df_age_debt['DebtRatio']

    last_bracket = len(age_edges) - 2
    age_debt_y = []
    for idx, (low, high) in enumerate(zip(age_edges, age_edges[1:])):
        # Brackets are half-open [low, high) so that a boundary age such as
        # 30 lands in exactly one bracket; an inclusive test on both ends
        # would count it twice. Only the final bracket keeps its upper edge.
        if idx == last_bracket:
            in_bracket = (age_col >= low) & (age_col <= high)
        else:
            in_bracket = (age_col >= low) & (age_col < high)
        bracket_rows = df_age_debt.filter(in_bracket)

        y0 = bracket_rows.filter(debt_col < 1).count()
        print(y0)
        y1 = bracket_rows.filter(debt_col >= 1).count()
        print(y1)
        age_debt_y.append([y0, y1])

    # The extra list level is kept so the return shape stays unchanged.
    # Original note: the result is visualised by data_web.py.
    return [age_debt_y]
# Caption for the code above: statistics for each age group.
def XiangGuanXing(self):
    '''
    Compute the pairwise Pearson correlations used to draw the heat map.

    :return: a list of ``[row_index, column_index, value]`` triples, one for
        every ordered pair of features; indices refer to positions in the
        feature list and the diagonal cells carry the constant ``1``
    '''
    feature = ['y', 'age', '30-59days', 'DebtRatio', 'MonthlyIncome',
               'num_late_card', '60-89days', 'RealEstateLoans', 'families']
    heat_map_data_arr = []
    for row_idx, row_name in enumerate(feature):
        for col_idx, col_name in enumerate(feature):
            if row_idx == col_idx:
                # A feature against itself is hard-coded, no query needed.
                value = 1
            else:
                pair_df = df.select(df[row_name].cast('float'),
                                    df[col_name].cast('float'))
                value = pair_df.corr(row_name, col_name, 'pearson')
            heat_map_data_arr.append([row_idx, col_idx, value])
    # The triples are already in the layout the heat map expects.
    print(heat_map_data_arr)
    return heat_map_data_arr
# Caption for the code above: computing the correlations.
def get_score(self):
    '''
    Score every user and report the ten lowest and ten highest scores.

    Each feature is weighted by the absolute Pearson correlation between
    that feature and the label ``y`` (``y`` itself gets correlation 1),
    normalised so the weights sum to 1. A user's score is the weighted sum
    of the feature values, each feature multiplied by its own weight.

    :return: a tuple ``(best_customer, worst_customer)``;
        ``best_customer`` holds up to ten ``[ID, score]`` pairs with the
        lowest scores in ascending order, as ``[int, float]``;
        ``worst_customer`` holds up to ten ``[ID, score]`` pairs with the
        highest scores in descending order, as ``[str, str]``
    '''
    feature = ['y', '30-59days', 'DebtRatio', 'num_late_card',
               '60-89days', 'RealEstateLoans']

    # Correlation of every feature with the label; the label's correlation
    # with itself is fixed at 1 instead of being computed.
    pearson_arr = [1]
    for name in feature[1:]:
        pair_df = df.select(df[name].cast('float'), df['y'].cast('float'))
        pearson_arr.append(pair_df.corr(name, 'y', 'pearson'))

    total = sum(abs(value) for value in pearson_arr)
    precent_arr = [abs(value) / total for value in pearson_arr]
    print(precent_arr)

    # Pair each feature with its own weight. Indexing the weights by hand
    # had shifted them by one position after '30-59days' and left out
    # 'DebtRatio' although its weight was part of the normalisation.
    score_expr = None
    for name, weight in zip(feature, precent_arr):
        term = df[name] * weight
        score_expr = term if score_expr is None else score_expr + term

    df_score = df.withColumn('score', score_expr)
    df_single_score = df_score.select(df_score['ID'], df_score['score'])
    df_sort_score = df_single_score.sort('score').collect()

    # Read the fields by name rather than parsing the Row's text form, and
    # slice so that fewer than ten rows does not raise IndexError.
    best_customer = []
    for row in df_sort_score[:10]:
        print(row['ID'], row['score'])
        best_customer.append([int(row['ID']), float(row['score'])])
    print(best_customer)

    # NOTE(review): these entries stay strings, unlike best_customer, to keep
    # the return value unchanged for callers; confirm before unifying.
    worst_customer = []
    for row in reversed(df_sort_score[-10:]):
        print(row['ID'], row['score'])
        worst_customer.append([str(row['ID']), str(row['score'])])
    print(worst_customer)

    return best_customer, worst_customer
# Caption for the code above: score statistics.



