利用 PySpark 对武汉租房数据进行分析;爬取其他地区的租房数据后,也可以套用本代码。
代码如下:
import matplotlib.pyplot as plt
import pandas as pd
import pyspark.sql.functions as F
from pyspark.ml.stat import Correlation
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType
# Start (or reuse) a local Spark session for the rent analysis job.
session_builder = SparkSession.builder.master("local").appName("rent_analyse")
spark = session_builder.getOrCreate()

# Load the listings; the first line of the CSV supplies the column names.
df1 = spark.read.csv("zh.csv", header=True, encoding="UTF-8")

# Cast rent ("租金") and floor area ("面积") to integers so that the numeric
# comparisons and aggregations further down operate on numbers.
# NOTE(review): no schema is passed to read.csv, so these columns presumably
# arrive as strings -- confirm against the data file.
for numeric_column in ("租金", "面积"):
    df1 = df1.withColumn(numeric_column, df1[numeric_column].cast(IntegerType()))
# Collect every distinct district ("区划") value to the driver.
area = df1.select("区划").distinct().collect()

# Each collected Row carries a single field, so its first value is the
# district itself.
place = [row[0] for row in area]
# Per-district statistics, index-aligned with `place`:
#   mean_price / max_price / min_price -- average, highest, lowest rent ("租金")
#   mean_size  / max_size  / min_size  -- average, largest, smallest area ("面积")
mean_price = []
max_price = []
min_price = []
mean_size = []
max_size = []
min_size = []
for district in place:
    # Compute all six statistics in ONE aggregation (one Spark job per
    # district) instead of six separate jobs over the same filtered frame.
    stats = df1.filter(df1.区划 == district).agg(
        F.avg("租金"),
        F.max("租金"),
        F.min("租金"),
        F.avg("面积"),
        F.max("面积"),
        F.min("面积"),
    ).first()
    # The Row is unpacked by position, in the order the aggregates were listed.
    avg_rent, top_rent, low_rent, avg_area, top_area, low_area = stats
    mean_price.append(avg_rent)
    max_price.append(top_rent)
    min_price.append(low_rent)
    mean_size.append(avg_area)
    max_size.append(top_area)
    min_size.append(low_area)
import pyspark.sql.functions as F
# Add a "性价比" column holding rent divided by floor area for each listing.
rent_per_area = df1["租金"] / df1["面积"]
df_new = df1.withColumn("性价比", rent_per_area)
# price_number[k] is the number of listings whose rent falls in the interval
# (1000*k, 1000*(k+1)]; the 27 buckets together cover rents in (0, 27000].
price_number = [
    df1.filter((df1.租金 > lower) & (df1.租金 <= lower + 1000)).count()
    for lower in range(0, 27 * 1000, 1000)
]
# size_number[k] is the number of listings whose floor area falls in the
# interval (50*k, 50*(k+1)]; the 12 buckets together cover areas in (0, 600].
size_number = [
    df1.filter((df1.面积 > lower) & (df1.面积 <= lower + 50)).count()
    for lower in range(0, 12 * 50, 50)
]
# Number of listings in each district, index-aligned with `place`.
place_number = [
    df1.filter(df1.区划 == district).count()
    for district in place
]
# Share of listings in each district whose "附近地铁" (nearby subway) field is
# not "无" (none), index-aligned with `place`.
subway_rate = []
for district in place:
    listings = df1.filter(df1.区划 == district)
    total = listings.count()
    if total == 0:
        # A district value that matches no rows (e.g. a null district, since
        # an equality test against null matches nothing) would otherwise
        # raise ZeroDivisionError; report a rate of 0.0 for it instead.
        subway_rate.append(0.0)
        continue
    near_subway = listings.filter(df1.附近地铁 != "无").count()
    subway_rate.append(near_subway / total)
# All per-district indicator series, in the order they are charted below.
zhibiao = [
    mean_price,
    max_price,
    min_price,
    mean_size,
    max_size,
    min_size,
    place_number,
    subway_rate,
]
# Configure matplotlib once, before any figure is created, rather than
# re-assigning the same settings on every loop iteration.
plt.rcParams['font.sans-serif'] = ['SimHei']  # font that can render the Chinese labels
plt.rcParams['axes.unicode_minus'] = False  # draw minus signs with the ASCII hyphen

# Show one bar chart per indicator, with the districts on the x axis.
# NOTE(review): the original indentation was lost; this assumes each indicator
# was meant to be drawn and shown as its own figure -- confirm.
for series in zhibiao:
    plt.bar(place, series)
    plt.show()
from pyecharts import Bar
# Legend labels for the entries of `zhibiao`, in the order that list is built:
# mean/max/min rent, mean/max/min area, listing count, subway share.
series_names = [
    "平均租金",
    "最高租金",
    "最低租金",
    "平均面积",
    "最大面积",
    "最小面积",
    "房源数量",
    "地铁房占比",
]

# Render every indicator as a stacked bar series over the districts.
bar = Bar("武汉市租房租金概况")
for series_name, series in zip(series_names, zhibiao):
    # The first argument of add() is the series name shown in the legend; the
    # original passed the data list itself there, giving a garbled legend.
    bar.add(series_name, place, series, is_stack=True)
bar.render()



