栏目分类:
子分类:
返回
名师互学网用户登录
快速导航关闭
当前搜索
当前分类
子分类
实用工具
热门搜索
名师互学网 > IT > 软件开发 > 后端开发 > Python

纽约出租车旅途时间建模分析

Python 更新时间: 发布时间: IT归档 最新发布 模块sitemap 名妆网 法律咨询 聚返吧 英语巴士网 伯小乐 网商动力

纽约出租车旅途时间建模分析

根据纽约出租车的运营数据,针对客户旅途时间展开分析与建模。
import os
import pandas as pd
import numpy as np
from matplotlib.pyplot import *
import matplotlib.pyplot as plt
from matplotlib import animation
from matplotlib import cm
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from dateutil import parser
import io
import base64
# Load the NYC taxi training set into `df`.
# NOTE(review): the original run failed here with FileNotFoundError because
# train.csv was missing from the working directory (see traceback below) —
# the file path, not the code, was the problem.
df = pd.read_csv('train.csv')
df.head()
---------------------------------------------------------------------------

FileNotFoundError                         Traceback (most recent call last)

 in ()
----> 1 df = pd.read_csv('train.csv')
      2 df.head()


~Anaconda3libsite-packagespandasioparsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    707                     skip_blank_lines=skip_blank_lines)
    708 
--> 709         return _read(filepath_or_buffer, kwds)
    710 
    711     parser_f.__name__ = name


~Anaconda3libsite-packagespandasioparsers.py in _read(filepath_or_buffer, kwds)
    447 
    448     # Create the parser.
--> 449     parser = TextFileReader(filepath_or_buffer, **kwds)
    450 
    451     if chunksize or iterator:


~Anaconda3libsite-packagespandasioparsers.py in __init__(self, f, engine, **kwds)
    816             self.options['has_index_names'] = kwds['has_index_names']
    817 
--> 818         self._make_engine(self.engine)
    819 
    820     def close(self):


~Anaconda3libsite-packagespandasioparsers.py in _make_engine(self, engine)
   1047     def _make_engine(self, engine='c'):
   1048         if engine == 'c':
-> 1049             self._engine = CParserWrapper(self.f, **self.options)
   1050         else:
   1051             if engine == 'python':


~Anaconda3libsite-packagespandasioparsers.py in __init__(self, src, **kwds)
   1693         kwds['allow_leading_cols'] = self.index_col is not False
   1694 
-> 1695         self._reader = parsers.TextReader(src, **kwds)
   1696 
   1697         # XXX


pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()


pandas/_libs/parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()


FileNotFoundError: File b'train.csv' does not exist

太远地方的就先去掉啦

# Keep only trips whose pickup AND dropoff both fall inside the NYC
# bounding box; everything farther out is discarded as an outlier.
xlim = [-74.03, -73.77]
ylim = [40.63, 40.85]
for lon_col in ('pickup_longitude', 'dropoff_longitude'):
    df = df[(df[lon_col] > xlim[0]) & (df[lon_col] < xlim[1])]
for lat_col in ('pickup_latitude', 'dropoff_latitude'):
    df = df[(df[lat_col] > ylim[0]) & (df[lat_col] < ylim[1])]

上下车地点集中区域

# Pool pickup and dropoff coordinates and scatter them: dense regions of
# the city show up as dark areas.
longitude = list(df.pickup_longitude) + list(df.dropoff_longitude)
latitude = list(df.pickup_latitude) + list(df.dropoff_latitude)
plt.figure(figsize=(10, 10))
plt.plot(longitude, latitude, '.', alpha=0.4, markersize=0.05)
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-HCsphdXx-1633489386722)(output_6_0.png)]

根据上下车的地点,将区域分一下,用聚类来试试

# Cluster all pickup/dropoff points into 15 geographic zones.
# BUG FIX: pandas has no `pd.Dataframe` attribute (AttributeError at runtime);
# the class is `pd.DataFrame`.
loc_df = pd.DataFrame()
loc_df['longitude'] = longitude
loc_df['latitude'] = latitude
kmeans = KMeans(n_clusters=15, random_state=2, n_init = 10).fit(loc_df)
loc_df['label'] = kmeans.labels_

# Plot a 200k-point sample, coloring each cluster differently.
loc_df = loc_df.sample(200000)
plt.figure(figsize=(10, 10))
for lbl in loc_df.label.unique():
    in_cluster = loc_df.label == lbl
    plt.plot(loc_df.longitude[in_cluster], loc_df.latitude[in_cluster],
             '.', alpha=0.3, markersize=0.3)

plt.title('Clusters of New York')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-1yr91k0i-1633489386725)(output_9_0.png)]

给区域来个标记吧

# Same map in gray, with each cluster centre marked red and numbered blue.
fig, ax = plt.subplots(figsize=(10, 10))
for lbl in loc_df.label.unique():
    mask = loc_df.label == lbl
    cx = kmeans.cluster_centers_[lbl, 0]
    cy = kmeans.cluster_centers_[lbl, 1]
    ax.plot(loc_df.longitude[mask], loc_df.latitude[mask], '.',
            alpha=0.4, markersize=0.1, color='gray')
    ax.plot(cx, cy, 'o', color='r')
    ax.annotate(lbl, (cx, cy), color='b', fontsize=20)
ax.set_title('Cluster Centers')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-pcDq1idT-1633489386727)(output_11_0.png)]

# Attach cluster ids to every trip endpoint and extract the pickup hour.
df['pickup_cluster'] = kmeans.predict(df[['pickup_longitude','pickup_latitude']])
df['dropoff_cluster'] = kmeans.predict(df[['dropoff_longitude','dropoff_latitude']])
df['pickup_hour'] = df.pickup_datetime.apply(lambda x: parser.parse(x).hour )
# BUG FIX: pd.Dataframe -> pd.DataFrame (the original raised AttributeError).
clusters = pd.DataFrame()
clusters['x'] = kmeans.cluster_centers_[:,0]
clusters['y'] = kmeans.cluster_centers_[:,1]
clusters['label'] = range(len(clusters))

loc_df = loc_df.sample(5000)

展示了方向与趋势,箭头的宽度与车流成正比

fig, ax = plt.subplots(1, 1, figsize = (10,10))

def animate(hour):
    """Draw one frame: gray cluster map plus green arrows between cluster
    pairs, arrow width proportional to that pair's share of rides at `hour`."""
    ax.clear()
    ax.set_title('Relative Traffic - Hour ' + str(int(hour)) + ':00')
    # BUG FIX: the original opened a fresh plt.figure() on every frame, which
    # leaked one figure per hour (the "More than 20 figures" RuntimeWarning
    # in the output) and drew nothing useful; all plotting goes to `ax`.
    for label in loc_df.label.unique():
        ax.plot(loc_df.longitude[loc_df.label == label],loc_df.latitude[loc_df.label == label],'.', alpha = 1, markersize = 2, color = 'gray')
        ax.plot(kmeans.cluster_centers_[label,0],kmeans.cluster_centers_[label,1],'o', color = 'r')

    # Hoisted: total rides this hour is invariant across the pair loop below.
    total_rides = len(df[df.pickup_hour == hour])
    for label in clusters.label:
        for dest_label in clusters.label:
            num_of_rides = len(df[(df.pickup_cluster == label) & (df.dropoff_cluster == dest_label) & (df.pickup_hour == hour)])
            dist_x = clusters.x[clusters.label == label].values[0] - clusters.x[clusters.label == dest_label].values[0]
            dist_y = clusters.y[clusters.label == label].values[0] - clusters.y[clusters.label == dest_label].values[0]
            pct = np.true_divide(num_of_rides, total_rides)
            # NOTE(review): bare `Arrow` resolves via the wildcard pyplot
            # import at the top of the file; matplotlib.patches.Arrow is its
            # explicit home — confirm it is in scope on the target version.
            arr = Arrow(clusters.x[clusters.label == label].values, clusters.y[clusters.label == label].values, -dist_x, -dist_y, edgecolor='white', width = pct)
            ax.add_patch(arr)
            arr.set_facecolor('g')


ani = animation.FuncAnimation(fig,animate,sorted(df.pickup_hour.unique()), interval = 1000)
plt.close()
ani.save('animation2.html', writer='imagemagick', fps=2)
e:ProgramDataAnaconda3libsite-packagesmatplotlibpyplot.py:523: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  max_open_warning, RuntimeWarning)
e:ProgramDataAnaconda3libsite-packagesmatplotlibanimation.py:1218: UserWarning: MovieWriter imagemagick unavailable
  warnings.warn("MovieWriter %s unavailable" % writer)

邻居分析

# Representative longitude -> human-readable neighborhood name.
# NOTE(review): 'Uppe East Side' and 'Brooklyn-Williamsburgt' look like typos,
# but they are runtime labels that flow into later columns, so kept as-is.
neighborhood = {-74.0019368351: 'Chelsea',-73.837549761: 'Queens',-73.7854240738: 'JFK',-73.9810421975:'Midtown-North-West',-73.9862336241: 'East Village',
                -73.971273324:'Midtown-North-East',-73.9866739677: 'Brooklyn-parkslope',-73.8690098118: 'LaGuardia',-73.9890572967:'Midtown',-74.0081765545: 'Downtown'
                ,-73.9213024854: 'Queens-Astoria',-73.9470256923: 'Harlem',-73.9555565018: 'Uppe East Side',
               -73.9453487097: 'Brooklyn-Williamsburgt',-73.9745967889:'Upper West Side'}
# BUG FIX: pd.Dataframe -> pd.DataFrame (AttributeError in the original).
rides_df = pd.DataFrame(columns = neighborhood.values())
rides_df['name'] = neighborhood.values()

# 1-NN on longitude alone: each trip gets the nearest named neighborhood.
neigh = KNeighborsClassifier(n_neighbors=1)
neigh.fit(np.array(list(neighborhood.keys())).reshape(-1, 1), list(neighborhood.values()))
# BUG FIX: Series.reshape is deprecated (the FutureWarnings in the original
# output) — use .values.reshape(...) as the warning itself recommends.
df['pickup_neighborhood'] = neigh.predict(df.pickup_longitude.values.reshape(-1,1))
df['dropoff_neighborhood'] = neigh.predict(df.dropoff_longitude.values.reshape(-1,1))

# Cross-tab of ride counts: rows = origin neighborhood, cols = destination.
for col in rides_df.columns[:-1]:
    rides_df[col] = rides_df.name.apply(lambda x: len(df[(df.pickup_neighborhood == x) & (df.dropoff_neighborhood == col)]))
e:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:1: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
  """Entry point for launching an IPython kernel.
e:ProgramDataAnaconda3libsite-packagesipykernel_launcher.py:2: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead
rides_df.head()
ChelseaQueensJFKMidtown-North-WestEast VillageMidtown-North-EastBrooklyn-parkslopeLaGuardiaMidtownDowntownQueens-AstoriaHarlemUppe East SideBrooklyn-WilliamsburgtUpper West Sidename
0285262289501898910622765761411497359632208421192209799823178742Chelsea
12737593553720512033244315431541Queens
21887122127793578211623517431463320715761749993284712442208JFK
31749641621833083313214270056747420635307101962940465422343389823537Midtown-North-West
410616186116813532561980302622207316225598016251793709917049138East Village
# Interactive neighborhood-to-neighborhood heatmap (plotly offline mode).
# BUG FIX: dropped the unused `import plotly.plotly` (the cloud/chart-studio
# API) — it is the line that raised ModuleNotFoundError in the original run
# and nothing below uses it.
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

# BUG FIX: DataFrame.as_matrix() was deprecated in pandas 0.23 and removed
# in 1.0; `.values` is the supported equivalent.
trace = go.Heatmap(z= np.array(rides_df.values),
                  x = rides_df.columns[:-1],
                  y = rides_df.columns)
layout = dict(
    title = ' Neighborhoods Interaction',
    titlefont = dict(
    size = 30,
    color = ('rgb(100,100,100)')),
    margin = dict(t=100,r=100,b=100,l=150),
        yaxis = dict(
            title = '  From '),
        xaxis = dict(
            title = ' To '))
data=[trace]
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='labelled-heatmap')
---------------------------------------------------------------------------

ModuleNotFoundError                       Traceback (most recent call last)

 in ()
----> 1 import plotly.plotly
      2 import plotly.offline as py
      3 import plotly.graph_objs as go
      4 py.init_notebook_mode(connected=True)
      5 


ModuleNotFoundError: No module named 'plotly'

进出分析

# Outbound (row sums) vs inbound (column sums) ride totals per neighborhood.
fig,ax = plt.subplots(figsize = (12,12))
for i in range(len(rides_df)):
    # Hoisted: the two sums were each computed twice per iteration.
    out_rides = rides_df.sum(axis = 1)[i]
    in_rides = rides_df.sum(axis = 0)[i]
    ax.plot(out_rides, in_rides, 'o', color = 'b')
    ax.annotate(rides_df.index.tolist()[i], (out_rides, in_rides), color = 'b', fontsize = 12)

# y = x reference: points on the line have balanced in/out traffic.
ax.plot([0,250000],[0,250000], color = 'r', linewidth = 1)
# BUG FIX: ax.grid('off') passes a truthy string (and raises on modern
# matplotlib); grid(False) is the documented way to disable the grid.
ax.grid(False)
ax.set_xlim([0,250000])
ax.set_ylim([0,250000])
ax.set_xlabel('Outbound Taxis')
ax.set_ylabel('Inbound Taxis')
ax.set_title('Inbound and Outbound rides for each cluster')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-LBdcqkpS-1633489386732)(output_23_0.png)]

我们可以看到,每个地区的出入的比率是相对平衡的。

import pandas as pd  # dataframes and CSV I/O
import numpy as np   # vector operations and basic maths
#import simplejson    #getting JSON in simplified format
import urllib        # URL handling
#import gmaps       #for using google maps to visulalize places on maps
import re            # regular expressions
import datetime      # datetime operations
import calendar      # calendar helpers for datetime operations
import time          # wall-clock timing of cells
import scipy         # other dependencies
from sklearn.cluster import KMeans # K-means clustering
from haversine import haversine # haversine distance
import math          # basic maths operations
import seaborn as sns # statistical plots
import matplotlib.pyplot as plt # plotting
import os  # OS commands
# NOTE(review): scipy.misc.imread/imresize/imsave were removed in SciPy >= 1.2
# (imageio / PIL are the replacements) — confirm the pinned SciPy version.
from scipy.misc import imread, imresize, imsave  # for plots 
# NOTE(review): plotly.plotly moved to the separate chart-studio package in
# plotly 4.x; only the offline API is actually used below — confirm version.
import plotly.plotly as py
import plotly.graph_objs as go
import plotly
from bokeh.palettes import Spectral4
from bokeh.plotting import figure, output_notebook, show
from IPython.display import HTML
from matplotlib.pyplot import *
from matplotlib import cm
from matplotlib import animation
import io
import base64
import warnings
# Silence pandas/seaborn FutureWarnings for cleaner notebook output.
warnings.filterwarnings("ignore")
output_notebook()
plotly.offline.init_notebook_mode() # run at the start of every ipython notebook
    
    Loading BokehJS ...





IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

数据读取与特征选择

s = time.time()
# Merge the OSRM fastest-route features onto the raw training trips by id.
part1 = pd.read_csv('./data/fastest_routes_train_part_1.csv')
part2 = pd.read_csv('./data/fastest_routes_train_part_2.csv')
train_fr = pd.concat([part1, part2])
train_fr_new = train_fr[['id', 'total_distance', 'total_travel_time', 'number_of_steps']]
train_df = pd.read_csv('./data/train.csv')
train = pd.merge(train_df, train_fr_new, on = 'id', how = 'left')
train_df = train.copy()
end = time.time()
print("Time taken by above cell is {}.".format((end-s)))
train_df.head()
Time taken by above cell is 14.2900869846344.
idvendor_idpickup_datetimedropoff_datetimepassenger_countpickup_longitudepickup_latitudedropoff_longitudedropoff_latitudestore_and_fwd_flagtrip_durationtotal_distancetotal_travel_timenumber_of_steps
0id287542122016-03-14 17:24:552016-03-14 17:32:301-73.98215540.767937-73.96463040.765602N4552009.1164.95.0
1id237739412016-06-12 00:43:352016-06-12 00:54:381-73.98041540.738564-73.99948140.731152N6632513.2332.06.0
2id385852922016-01-19 11:35:242016-01-19 12:10:481-73.97902740.763939-74.00533340.710087N212411060.8767.616.0
3id350467322016-04-06 19:32:312016-04-06 19:39:401-74.01004040.719971-74.01226840.706718N4291779.4235.84.0
4id218102822016-03-26 13:30:552016-03-26 13:38:101-73.97305340.793209-73.97292340.782520N4351614.9140.15.0

数据检查

# Sanity checks on the merged set: shape, id uniqueness, and null count
# (the 3 nulls come from the left merge with the OSRM features).
train_data = train_df.copy()
print("Number of columns and rows and columns are {} and {} respectively.".format(train_data.shape[1], train_data.shape[0]))
if train_data.id.nunique() == train_data.shape[0]:
    print("Train ids are unique")
print("Number of Nulls - {}.".format(train_data.isnull().sum().sum()))
Number of columns and rows and columns are 14 and 1458644 respectively.
Train ids are unique
Number of Nulls - 3.

旅行持续时间log展示

%matplotlib inline
start = time.time()
# Histogram of log(trip_duration + 1): the log transform compresses the heavy
# right tail so the distribution is readable (roughly bell-shaped).
sns.set(style="white", palette="muted", color_codes=True)
f, axes = plt.subplots(1, 1, figsize=(11, 7), sharex=True)
sns.despine(left=True)
sns.distplot(np.log(train_df['trip_duration'].values+1), axlabel = 'Log(trip_duration)', label = 'log(trip_duration)', bins = 50, color="r")
plt.setp(axes, yticks=[])
plt.tight_layout()
end = time.time()
print("Time taken by above cell is {}.".format((end-start)))
plt.show()
Time taken by above cell is 0.2782478332519531.

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-YrFRifAU-1633489386736)(output_31_1.png)]

正态分布的,有个别时间长得有点离谱了。。。也有个别的神速了

# e^4..e^8 seconds -> roughly 0.9 to 50 minutes: the bulk of trips per the
# log-scale histogram above.
print ('大部分的旅行时间是在:',np.exp(4)/60,np.exp(8)/60)
# Extremes: e^2 s (~7 s) and e^12 s (~45 h) trips — likely data errors.
print ('比较吊的。。。',np.exp(2)/60,np.exp(12)/60)
大部分的旅行时间是在: 0.909969167219 49.6826331174
比较吊的。。。 0.123150934982 2712.57985698

数据提供的位置

start = time.time()
# Raw latitude/longitude distributions for both trip endpoints, before any
# outlier clipping — the long tails motivate the bounding-box filter below.
sns.set(style="white", palette="muted", color_codes=True)
f, axes = plt.subplots(2,2,figsize=(10, 10), sharex=False, sharey = False)
sns.despine(left=True)
sns.distplot(train_df['pickup_latitude'].values, label = 'pickup_latitude',color="m",bins = 100, ax=axes[0,0])
sns.distplot(train_df['pickup_longitude'].values, label = 'pickup_longitude',color="m",bins =100, ax=axes[0,1])
sns.distplot(train_df['dropoff_latitude'].values, label = 'dropoff_latitude',color="m",bins =100, ax=axes[1, 0])
sns.distplot(train_df['dropoff_longitude'].values, label = 'dropoff_longitude',color="m",bins =100, ax=axes[1, 1])
plt.setp(axes, yticks=[])
plt.tight_layout()
end = time.time()
print("Time taken by above cell is {}.".format((end-start)))
plt.show()
Time taken by above cell is 1.2390995025634766.

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-0wExblfP-1633489386738)(output_35_1.png)]

有些位置是不是太偏僻了,还是统计误差啊,去掉那些离谱的

纬度控制在40.6到40.9
经度控制在-74.05到-73.70

start = time.time()
# Clip both endpoints to lat (40.6, 40.9) x lon (-74.05, -73.7), then re-plot
# the four coordinate distributions on the cleaned data.
mask = ((train_df.pickup_latitude > 40.6) & (train_df.pickup_latitude < 40.9)
        & (train_df.dropoff_latitude > 40.6) & (train_df.dropoff_latitude < 40.9)
        & (train_df.dropoff_longitude > -74.05) & (train_df.dropoff_longitude < -73.7)
        & (train_df.pickup_longitude > -74.05) & (train_df.pickup_longitude < -73.7))
df = train_df.loc[mask]
train_data_new = df.copy()
sns.set(style="white", palette="muted", color_codes=True)
f, axes = plt.subplots(2,2,figsize=(12, 12), sharex=False, sharey = False)
sns.despine(left=True)
panels = [('pickup_latitude', "m", axes[0, 0]),
          ('pickup_longitude', "g", axes[0, 1]),
          ('dropoff_latitude', "m", axes[1, 0]),
          ('dropoff_longitude', "g", axes[1, 1])]
for col, colr, axis in panels:
    sns.distplot(train_data_new[col].values, label=col, color=colr, bins=100, ax=axis)
plt.setp(axes, yticks=[])
plt.tight_layout()
end = time.time()
print("Time taken by above cell is {}.".format((end-start)))
print(df.shape[0], train_data.shape[0])
plt.show()
Time taken by above cell is 1.8928685188293457.
1452385 1458644

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-iC5agbNj-1633489386742)(output_38_1.png)]

以黑色为背景

temp = train_data.copy()
start = time.time()
# 3000x3500 black RGB canvas: rows index latitude (40.6-40.9), columns index
# longitude (-74.05 - -73.70), one pixel per 1e-4 degree.
# FIX: np.zeros already returns an all-zero array, so the original's three
# redundant `rgb[..., k] = 0` assignments are dropped.
rgb = np.zeros((3000, 3500, 3), dtype=np.uint8)
# Integer pixel offsets of each endpoint from the box's south-west corner.
train_data_new['pick_lat_new'] = list(map(int, (train_data_new['pickup_latitude'] - (40.6000))*10000))
train_data_new['drop_lat_new'] = list(map(int, (train_data_new['dropoff_latitude'] - (40.6000))*10000))
train_data_new['pick_lon_new'] = list(map(int, (train_data_new['pickup_longitude'] - (-74.050))*10000))
train_data_new['drop_lon_new'] = list(map(int,(train_data_new['dropoff_longitude'] - (-74.050))*10000))

train_data_new.head()
idvendor_idpickup_datetimedropoff_datetimepassenger_countpickup_longitudepickup_latitudedropoff_longitudedropoff_latitudestore_and_fwd_flagtrip_durationtotal_distancetotal_travel_timenumber_of_stepspick_lat_newdrop_lat_newpick_lon_newdrop_lon_new
0id287542122016-03-14 17:24:552016-03-14 17:32:301-73.98215540.767937-73.96463040.765602N4552009.1164.95.016791656678853
1id237739412016-06-12 00:43:352016-06-12 00:54:381-73.98041540.738564-73.99948140.731152N6632513.2332.06.013851311695505
2id385852922016-01-19 11:35:242016-01-19 12:10:481-73.97902740.763939-74.00533340.710087N212411060.8767.616.016391100709446
3id350467322016-04-06 19:32:312016-04-06 19:39:401-74.01004040.719971-74.01226840.706718N4291779.4235.84.011991067399377
4id218102822016-03-26 13:30:552016-03-26 13:38:101-73.97305340.793209-73.97292340.782520N4351614.9140.15.019321825769770
# Pickup counts per (lat, lon) pixel.
# BUG FIX: pd.Dataframe -> pd.DataFrame (AttributeError in the original).
summary_plot = pd.DataFrame(train_data_new.groupby(['pick_lat_new', 'pick_lon_new'])['id'].count())

summary_plot.reset_index(inplace = True)
summary_plot.head()
pick_lat_newpick_lon_newid
025441
168401
284541
397061
41710301
# Paint each pickup pixel by trip count: >50, 10-50, and 1-10 trip bands.
lat_list = summary_plot['pick_lat_new'].unique()
for i in lat_list:
    row = summary_plot.loc[summary_plot['pick_lat_new'] == i]
    # FIX: iterate (lon, count) pairs directly; the original looked up
    # `unit[lon_list.index(j)]`, which is O(n) per pixel (quadratic per
    # latitude row) and returns the wrong count if a longitude repeats.
    for j, a in zip(row['pick_lon_new'].tolist(), row['id'].tolist()):
        if (a // 50) > 0:
            # NOTE(review): this paints magenta (255, 0, 255), though the
            # legend below describes the >50 band as yellow — confirm intent.
            rgb[i, j, 0] = 255
            rgb[i, j, 1] = 0
            rgb[i, j, 2] = 255
        elif (a // 10) > 0:
            rgb[i, j, 0] = 0
            rgb[i, j, 1] = 255
            rgb[i, j, 2] = 0
        else:
            rgb[i, j, 0] = 255
            rgb[i, j, 1] = 0
            rgb[i, j, 2] = 0
fig, ax = plt.subplots(nrows=1,ncols=1,figsize=(14,20))
end = time.time()
print("Time taken by above cell is {}.".format((end-start)))
ax.imshow(rgb, cmap = 'hot')
ax.set_axis_off() 
Time taken by above cell is 4.935481071472168.

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-43bBGevN-1633489386745)(output_42_1.png)]

  • 红点表示在给定数据中的1-10次行程具有该点作为起始点
  • 绿点表示在给定数据中超过10-50次旅行具有该点作为起始点
  • 黄点表示在给定数据中超过50次以上的行程具有该点作为起始点

特征工程:
选择对旅途时间有影响的因素

#空间地理距离
start = time.time()
def haversine_(lat1, lng1, lat2, lng2):
    """Great-circle (haversine) distance in km between coordinate arrays."""
    AVG_EARTH_RADIUS = 6371  # mean Earth radius, km
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    dlat = lat2 - lat1
    dlng = lng2 - lng1
    # Haversine formula: chord half-length, then back to arc length.
    a = np.sin(dlat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlng * 0.5) ** 2
    return 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(a))

def manhattan_distance_pd(lat1, lng1, lat2, lng2):
    """L1 ("taxicab") distance: sum of the lng-aligned and lat-aligned
    haversine legs between pickup and dropoff."""
    lng_leg = haversine_(lat1, lng1, lat1, lng2)
    lat_leg = haversine_(lat1, lng1, lat2, lng1)
    return lng_leg + lat_leg

import math
def bearing_array(lat1, lng1, lat2, lng2):
    """Initial compass bearing in degrees (-180, 180] from point 1 to point 2.

    FIX: removed the unused local AVG_EARTH_RADIUS (dead code — bearings do
    not depend on the Earth's radius).
    """
    # Delta must be taken before the in-place radian conversion below.
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

end = time.time()
print("Time taken by above cell is {}.".format((end-start)))
Time taken by above cell is 0.0.
start = time.time()
# Restore the un-clipped copy and derive calendar + distance features.
train_data = temp.copy()
train_data['pickup_datetime'] = pd.to_datetime(train_data.pickup_datetime)
train_data.loc[:, 'pick_month'] = train_data['pickup_datetime'].dt.month
train_data.loc[:, 'hour'] = train_data['pickup_datetime'].dt.hour
# NOTE(review): Series.dt.weekofyear is deprecated in newer pandas (use
# .dt.isocalendar().week) — fine on the old version this notebook ran on.
train_data.loc[:, 'week_of_year'] = train_data['pickup_datetime'].dt.weekofyear
train_data.loc[:, 'day_of_year'] = train_data['pickup_datetime'].dt.dayofyear
train_data.loc[:, 'day_of_week'] = train_data['pickup_datetime'].dt.dayofweek
# Straight-line (haversine) and L1 (manhattan) distances plus compass bearing
# between pickup and dropoff, via the helpers defined above.
train_data.loc[:,'hvsine_pick_drop'] = haversine_(train_data['pickup_latitude'].values, train_data['pickup_longitude'].values, train_data['dropoff_latitude'].values, train_data['dropoff_longitude'].values)
train_data.loc[:,'manhtn_pick_drop'] = manhattan_distance_pd(train_data['pickup_latitude'].values, train_data['pickup_longitude'].values, train_data['dropoff_latitude'].values, train_data['dropoff_longitude'].values)
train_data.loc[:,'bearing'] = bearing_array(train_data['pickup_latitude'].values, train_data['pickup_longitude'].values, train_data['dropoff_latitude'].values, train_data['dropoff_longitude'].values)

end = time.time()
print("Time taken by above cell is {}.".format(end-start))

Time taken by above cell is 2.7582061290740967.
start = time.time()
def color(hour):
    """Linear color offset used to shift the palette by hour of day."""
    return(10*hour)

def Animation(hour, temp, rgb):
    """Render the pickup heat-map for one `hour` onto a fresh canvas and
    return it as a (3000, 3500, 3) uint8 array.

    BUG FIX: pd.Dataframe -> pd.DataFrame (the original raised
    AttributeError). Also: `.copy()` on the hour slice avoids
    SettingWithCopyWarning, the redundant channel zeroing after np.zeros is
    dropped, and (lon, count) pairs are iterated with zip instead of the
    O(n) `unit[lon_list.index(j)]` lookup.
    """
    train_data_new = temp.loc[temp['hour'] == hour].copy()
    start = time.time()
    rgb = np.zeros((3000, 3500, 3), dtype=np.uint8)
    train_data_new['pick_lat_new'] = list(map(int, (train_data_new['pickup_latitude'] - (40.6000))*10000))
    train_data_new['drop_lat_new'] = list(map(int, (train_data_new['dropoff_latitude'] - (40.6000))*10000))
    train_data_new['pick_lon_new'] = list(map(int, (train_data_new['pickup_longitude'] - (-74.050))*10000))
    train_data_new['drop_lon_new'] = list(map(int,(train_data_new['dropoff_longitude'] - (-74.050))*10000))

    # Pickup counts per (lat, lon) pixel for this hour.
    summary_plot = pd.DataFrame(train_data_new.groupby(['pick_lat_new', 'pick_lon_new'])['id'].count())
    summary_plot.reset_index(inplace = True)

    lat_list = summary_plot['pick_lat_new'].unique()
    for i in lat_list:
        row = summary_plot.loc[summary_plot['pick_lat_new'] == i]
        for j, a in zip(row['pick_lon_new'].tolist(), row['id'].tolist()):
            if (a//50) > 0:
                rgb[i, j, 0] = 255 - color(hour)
                rgb[i, j, 1] = 255 - color(hour)
                rgb[i, j, 2] = 0 + color(hour)
            elif (a//10) > 0:
                rgb[i, j, 0] = 0 + color(hour)
                rgb[i, j, 1] = 255 - color(hour)
                rgb[i, j, 2] = 0 + color(hour)
            else:
                rgb[i, j, 0] = 255 - color(hour)
                rgb[i, j, 1] = 0 + color(hour)
                rgb[i, j, 2] = 0 + color(hour)
    end = time.time()
    print("Time taken by above cell is {} for {}.".format((end-start), hour))
    return(rgb)
end = time.time()
print("Time taken by above cell is {}.".format(end -start))
Time taken by above cell is 0.0.
start = time.time()
# Build one heat-map frame per hour of day (0-23).
train_data_new['pickup_datetime'] = pd.to_datetime(train_data_new.pickup_datetime)
train_data_new.loc[:, 'hour'] = train_data_new['pickup_datetime'].dt.hour

images_list = []
for hr in range(0, 24):
    images_list.append(Animation(hr, train_data_new, rgb.copy()))
end = time.time()
print("Time taken by above cell is {}.".format(end -start))
Time taken by above cell is 1.6389679908752441 for 0.
Time taken by above cell is 1.4990801811218262 for 1.
Time taken by above cell is 1.3021628856658936 for 2.
Time taken by above cell is 1.275221347808838 for 3.
Time taken by above cell is 1.2565860748291016 for 4.
Time taken by above cell is 1.2167680263519287 for 5.
Time taken by above cell is 1.4150054454803467 for 6.
Time taken by above cell is 1.555870771408081 for 7.
Time taken by above cell is 1.5852866172790527 for 8.
Time taken by above cell is 1.4900872707366943 for 9.
Time taken by above cell is 1.457101583480835 for 10.
Time taken by above cell is 1.4791648387908936 for 11.
Time taken by above cell is 1.4905309677124023 for 12.
Time taken by above cell is 1.471121072769165 for 13.
Time taken by above cell is 1.5651226043701172 for 14.
Time taken by above cell is 1.582446575164795 for 15.
Time taken by above cell is 1.5430455207824707 for 16.
Time taken by above cell is 1.8448097705841064 for 17.
Time taken by above cell is 1.6787598133087158 for 18.
Time taken by above cell is 1.6811716556549072 for 19.
Time taken by above cell is 1.634563684463501 for 20.
Time taken by above cell is 1.6993639469146729 for 21.
Time taken by above cell is 1.7138051986694336 for 22.
Time taken by above cell is 1.6760783195495605 for 23.
Time taken by above cell is 38.207443714141846.
start = time.time()
def build_gif(imgs = images_list, show_gif=False, save_gif=True, title=''):
    """function to create a gif of heatmaps"""
    # NOTE(review): the default `imgs = images_list` is evaluated once at
    # definition time, binding the global frame list built above — confirm
    # that re-binding images_list later is not expected to affect this.
    fig, ax = plt.subplots(nrows=1,ncols=1,figsize=(10,10))
    ax.set_axis_off()
    hr_range = list(range(0,24))
    def show_im(pairs):
        # One animation frame: pairs = (hour, rgb image).
        ax.clear()
        ax.set_title('Absolute Traffic - Hour ' + str(int(pairs[0])) + ':00')
        ax.imshow(pairs[1])
        ax.set_axis_off() 
    pairs = list(zip(hr_range, imgs))
    #ims = map(lambda x: (ax.imshow(x), ax.set_title(title)), imgs)
    im_ani = animation.FuncAnimation(fig, show_im, pairs,interval=500, repeat_delay=0, blit=False)
    plt.cla()
    # NOTE(review): saving an .html file through the imagemagick writer looks
    # mismatched (cf. the earlier "MovieWriter imagemagick unavailable"
    # warning for animation2) — confirm the intended writer/extension.
    if save_gif:
        im_ani.save('animation.html', writer='imagemagick') #, writer='imagemagick'
    if show_gif:
        plt.show()
    return
end = time.time()
print("Time taken by above cell is {}".format(end-start))
Time taken by above cell is 0.0
start = time.time()
# Render and save the 24-frame traffic animation defined above.
build_gif()
end = time.time()
print(end-start)
7.758885860443115

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-nujOayfB-1633489386747)(output_50_1.png)]

特征解释

start = time.time()
# Mean trip duration per day-of-week, one line per vendor.
# BUG FIX: pd.Dataframe -> pd.DataFrame (AttributeError in the original).
summary_wdays_avg_duration = pd.DataFrame(train_data.groupby(['vendor_id','day_of_week'])['trip_duration'].mean())
summary_wdays_avg_duration.reset_index(inplace = True)
summary_wdays_avg_duration['unit']=1
sns.set(style="white", palette="muted", color_codes=True)
sns.set_context("poster")
# NOTE(review): sns.tsplot was deprecated in seaborn 0.8 and removed in 0.12;
# sns.lineplot is the modern replacement — confirm the installed version.
sns.tsplot(data=summary_wdays_avg_duration, time="day_of_week", unit = "unit", condition="vendor_id", value="trip_duration")
sns.despine(bottom = False)
end = time.time()
print(end - start)
0.24365997314453125

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-l1Vmzxe1-1633489386749)(output_52_1.png)]

显而易见的是,出租车1类在一周中的所有日子里花费的时间都多于出租车2类,平均差不多多了250秒

Violin plot

import seaborn as sns
sns.set(style="whitegrid", palette="pastel", color_codes=True)
sns.set_context("poster")
# Compare the two vendors' log(trip_duration) distributions per passenger
# count; split violins put vendor 1 (green) and vendor 2 (red) side by side.
train_data2 = train_data.copy()
train_data2['trip_duration']= np.log(train_data['trip_duration'])
sns.violinplot(x="passenger_count", y="trip_duration", hue="vendor_id", data=train_data2, split=True,
               inner="quart",palette={1: "g", 2: "r"})

sns.despine(left=True)
print(df.shape[0])
1452385

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-6RDCnsQU-1633489386753)(output_55_1.png)]

  • 空载是在刷单吗。。。
  • 载客人数的分布情况差不多

Box-Plots

start = time.time()
# Boxplots of trip duration by day-of-week and vendor; the y-axis is capped
# at 6000 s because the raw maximum (printed below) is an extreme outlier.
sns.set(style="ticks")
sns.set_context("poster")
sns.boxplot(x="day_of_week", y="trip_duration", hue="vendor_id", data=train_data, palette="PRGn")
plt.ylim(0, 6000)
plt.legend(loc = 'upper right')
sns.despine(offset=10, trim=True)
print(train_data.trip_duration.max())
end = time.time()
print("Time taken by above cell is {}.".format(end-start))

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-0uC9HxJz-1633489386755)(output_58_1.png)]

  • 周六日的出行时间更短一些

line-plots

# Mean trip duration per hour, one line per day-of-week.
# BUG FIX: pd.Dataframe -> pd.DataFrame (AttributeError in the original).
summary_hour_duration = pd.DataFrame(train_data.groupby(['day_of_week','hour'])['trip_duration'].mean())
summary_hour_duration.reset_index(inplace = True)
summary_hour_duration['unit']=1
sns.set(style="white", palette="muted", color_codes=False)
sns.set_context("poster")
# NOTE(review): sns.tsplot was removed in seaborn 0.12; sns.lineplot is the
# modern replacement — confirm the installed version.
sns.tsplot(data=summary_hour_duration, time="hour", unit = "unit", condition="day_of_week", value="trip_duration")
sns.despine(bottom = False)

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-H102jfam-1633489386756)(output_61_0.png)]

  • 周六日在5点到15点之间还是比较快的

聚类

start = time.time()
def assign_cluster(df, k):
    """Attach 'label_pick'/'label_drop' cluster ids to df; return (df, model)."""
    pick_pts = df[['pickup_longitude','pickup_latitude']]
    drop_pts = df[['dropoff_longitude','dropoff_latitude']]
    # Centroid seeds taken from a previous local k-means run, so a single
    # n_init=1 pass converges quickly in this kernel.
    init = np.array([[ -73.98737616,   40.72981533],
       [-121.93328857,   37.38933945],
       [ -73.78423222,   40.64711269],
       [ -73.9546417 ,   40.77377538],
       [ -66.84140269,   36.64537175],
       [ -73.87040541,   40.77016484],
       [ -73.97316185,   40.75814346],
       [ -73.98861094,   40.7527791 ],
       [ -72.80966949,   51.88108444],
       [ -76.99779701,   38.47370625],
       [ -73.96975298,   40.69089596],
       [ -74.00816622,   40.71414939],
       [ -66.97216034,   44.37194443],
       [ -61.33552933,   37.85105133],
       [ -73.98001393,   40.7783577 ],
       [ -72.00626526,   43.20296402],
       [ -73.07618713,   35.03469086],
       [ -73.95759366,   40.80316361],
       [ -79.20167796,   41.04752096],
       [ -74.00106031,   40.73867723]])
    model = KMeans(n_clusters=k, init=init, n_init=1)
    model.fit(pick_pts)
    df['label_pick'] = model.labels_.tolist()
    df['label_drop'] = model.predict(drop_pts)
    return df, model

end = time.time()
print("time taken by thie script by now is {}.".format(end-start))
time taken by thie script by now is 0.0005013942718505859.
start = time.time()
train_cl, k_means = assign_cluster(train_data, 20)  # make it 100 when extracting features 
# Centroid coordinates per cluster label, merged back onto each trip.
# BUG FIX: pd.Dataframe -> pd.DataFrame (twice; AttributeError in the original).
# NOTE(review): columns are named (long, lat) — consistent with the KMeans fit
# on [longitude, latitude] above.
centroid_pickups = pd.DataFrame(k_means.cluster_centers_, columns = ['centroid_pick_long', 'centroid_pick_lat'])
centroid_dropoff = pd.DataFrame(k_means.cluster_centers_, columns = ['centroid_drop_long', 'centroid_drop_lat'])
centroid_pickups['label_pick'] = centroid_pickups.index
centroid_dropoff['label_drop'] = centroid_dropoff.index
train_cl = pd.merge(train_cl, centroid_pickups, how='left', on=['label_pick'])
train_cl = pd.merge(train_cl, centroid_dropoff, how='left', on=['label_drop'])
end = time.time()
print("Time taken in clustering is {}.".format(end - start))
Time taken in clustering is 2.5313637256622314.

聚类相关特征

  • 上下客点所在簇中心点的距离
  • 方向特征 - 集群质心之间的方向
start = time.time()
# Haversine distance from each endpoint to its assigned cluster centroid,
# and between the two centroids themselves.
train_cl.loc[:,'hvsine_pick_cent_p'] = haversine_(train_cl['pickup_latitude'].values, train_cl['pickup_longitude'].values, train_cl['centroid_pick_lat'].values, train_cl['centroid_pick_long'].values)
train_cl.loc[:,'hvsine_drop_cent_d'] = haversine_(train_cl['dropoff_latitude'].values, train_cl['dropoff_longitude'].values, train_cl['centroid_drop_lat'].values, train_cl['centroid_drop_long'].values)
train_cl.loc[:,'hvsine_cent_p_cent_d'] = haversine_(train_cl['centroid_pick_lat'].values, train_cl['centroid_pick_long'].values, train_cl['centroid_drop_lat'].values, train_cl['centroid_drop_long'].values)
# Manhattan-style distances for the same three point pairs.
train_cl.loc[:,'manhtn_pick_cent_p'] = manhattan_distance_pd(train_cl['pickup_latitude'].values, train_cl['pickup_longitude'].values, train_cl['centroid_pick_lat'].values, train_cl['centroid_pick_long'].values)
train_cl.loc[:,'manhtn_drop_cent_d'] = manhattan_distance_pd(train_cl['dropoff_latitude'].values, train_cl['dropoff_longitude'].values, train_cl['centroid_drop_lat'].values, train_cl['centroid_drop_long'].values)
train_cl.loc[:,'manhtn_cent_p_cent_d'] = manhattan_distance_pd(train_cl['centroid_pick_lat'].values, train_cl['centroid_pick_long'].values, train_cl['centroid_drop_lat'].values, train_cl['centroid_drop_long'].values)

# Bearings (directions) for the same pairs.
train_cl.loc[:,'bearing_pick_cent_p'] = bearing_array(train_cl['pickup_latitude'].values, train_cl['pickup_longitude'].values, train_cl['centroid_pick_lat'].values, train_cl['centroid_pick_long'].values)
# NOTE(review): column name says "cent_p" but the inputs are the *dropoff*
# centroid — likely intended as 'bearing_drop_cent_d'; kept unchanged because
# the later feature list (and the trained model) reference this name.
train_cl.loc[:,'bearing_drop_cent_p'] = bearing_array(train_cl['dropoff_latitude'].values, train_cl['dropoff_longitude'].values, train_cl['centroid_drop_lat'].values, train_cl['centroid_drop_long'].values)
train_cl.loc[:,'bearing_cent_p_cent_d'] = bearing_array(train_cl['centroid_pick_lat'].values, train_cl['centroid_pick_long'].values, train_cl['centroid_drop_lat'].values, train_cl['centroid_drop_long'].values)
# Implied average speeds using the OSRM total_travel_time as the divisor.
train_cl['speed_hvsn'] = train_cl.hvsine_pick_drop/train_cl.total_travel_time
train_cl['speed_manhtn'] = train_cl.manhtn_pick_drop/train_cl.total_travel_time
end = time.time()
print("Time Taken by above cell is {}.".format(end-start))
train_cl.head()
Time Taken by above cell is 3.551389694213867.
idvendor_idpickup_datetimedropoff_datetimepassenger_countpickup_longitudepickup_latitudedropoff_longitudedropoff_latitudestore_and_fwd_flag...hvsine_drop_cent_dhvsine_cent_p_cent_dmanhtn_pick_cent_pmanhtn_drop_cent_dmanhtn_cent_p_cent_dbearing_pick_cent_pbearing_drop_cent_pbearing_cent_p_cent_dspeed_hvsnspeed_manhtn
0id287542122016-03-14 17:24:552016-03-14 17:32:301-73.98215540.767937-73.96463040.765602N...1.0985852.3198571.3386011.5498402.8225538.812218-138.980503165.6409150.0090870.010524
1id237739412016-06-12 00:43:352016-06-12 00:54:381-73.98041540.738564-73.99948140.731152N...0.8454481.5201911.5730520.9687022.144236-149.031278-9.113659-49.1746170.0054380.007321
2id385852922016-01-19 11:35:242016-01-19 12:10:481-73.97902740.763939-74.00533340.710087N...0.5089225.7185711.1354900.6906977.848844142.642889-28.669171-148.9072920.0083180.010687
3id350467322016-04-06 19:32:312016-04-06 19:39:401-74.01004040.719971-74.01226840.706718N...0.8888280.0000000.8050891.1614660.000000166.83771822.5150490.0000000.0063000.007046
4id218102822016-03-26 13:30:552016-03-26 13:38:101-73.97305340.793209-73.97292340.782520N...0.7558200.0000002.2378461.0603220.000000-160.438403-127.7462300.0000000.0084840.008561

5 rows × 39 columns

聚类可视化展示

start = time.time()
def cluster_summary(sum_df):
    """Summarise trips per pickup cluster.

    Parameters
    ----------
    sum_df : pd.DataFrame
        Must contain 'label_pick', 'label_drop', 'trip_duration' and 'id'.

    Returns
    -------
    pd.DataFrame
        One row per pickup cluster with the mean trip duration
        ('avg_triptime'), the most-frequently visited dropoff cluster
        ('label_drop') and that pair's trip count ('id').
    """
    #agg_func = {'trip_duration':'mean','label_drop':'count','bearing':'mean','id':'count'} # that's how you use agg function with groupby
    # BUG FIX: `pd.Dataframe` does not exist; the class is `pd.DataFrame`.
    summary_avg_time = pd.DataFrame(sum_df.groupby('label_pick')['trip_duration'].mean())
    summary_avg_time.reset_index(inplace = True)
    # Trip counts for every (pickup, dropoff) cluster pair ...
    summary_pref_clus = pd.DataFrame(sum_df.groupby(['label_pick', 'label_drop'])['id'].count())
    summary_pref_clus = summary_pref_clus.reset_index()
    # ... then keep, per pickup cluster, the dropoff cluster with most trips.
    summary_pref_clus = summary_pref_clus.loc[summary_pref_clus.groupby('label_pick')['id'].idxmax()]
    summary = pd.merge(summary_avg_time, summary_pref_clus, how = 'left', on = 'label_pick')
    summary = summary.rename(columns={'trip_duration':'avg_triptime'})
    return summary
end = time.time()
print("Time Taken by above cell is {}.".format(end-start))
Time Taken by above cell is 0.0005021095275878906.
import folium
def show_fmaps(train_data, path=1):
    """Plot a random sample of pickup and dropoff points on a folium map.

    Only trips whose pickup cluster has more than 70000 records are
    considered; 50 of them are sampled and each endpoint gets a Marker.

    Parameters
    ----------
    train_data : pd.DataFrame
        Trip data with coordinate columns and a 'label_pick' column.
    path : int, optional
        Kept for interface compatibility; currently unused.

    Returns
    -------
    folium.Map
    """
    full_data = train_data
    # BUG FIX: `pd.Dataframe` -> `pd.DataFrame` (AttributeError otherwise).
    summary_full_data = pd.DataFrame(full_data.groupby('label_pick')['id'].count())
    summary_full_data.reset_index(inplace = True)
    # Keep only the "significant" clusters (> 70000 pickups).
    summary_full_data = summary_full_data.loc[summary_full_data['id']>70000]
    map_1 = folium.Map(location=[40.767937, -73.982155], zoom_start=10,tiles='Stamen Toner') # manually added centre
    new_df = train_data.loc[train_data['label_pick'].isin(summary_full_data.label_pick.tolist())].sample(50)
    new_df.reset_index(inplace = True, drop = True)
    # itertuples replaces the original per-index boolean-mask lookups
    # (four full-column scans per row) with a single linear pass.
    for row in new_df.itertuples():
        folium.Marker([row.pickup_latitude, row.pickup_longitude]).add_to(map_1)
        folium.Marker([row.dropoff_latitude, row.dropoff_longitude]).add_to(map_1)
    return map_1

重点的clusters:大于70000个记录

def clusters_map(clus_data, full_data, tile = 'OpenStreetMap', sig = 0, zoom = 12, circle = 0, radius_ = 30):
    """Plot pickup-cluster centroids on a folium map with summary popups.

    Parameters
    ----------
    clus_data : pd.DataFrame
        Centroid table with 'centroid_pick_long', 'centroid_pick_lat'
        and 'label_pick'.
    full_data : pd.DataFrame
        Trip data used to compute per-cluster summaries.
    tile : str
        folium tile set name.
    sig : int
        When 1, show only clusters with more than 70000 pickups.
    zoom : number
        Initial zoom level.
    circle : int
        When 1, additionally draw a CircleMarker around each centroid.
    radius_ : int
        CircleMarker radius.

    Returns
    -------
    folium.Map
    """
    map_1 = folium.Map(location=[40.767937, -73.982155], zoom_start=zoom,tiles= tile) # 'Mapbox' 'Stamen Toner'
    # BUG FIX: `pd.Dataframe` -> `pd.DataFrame` (AttributeError otherwise).
    summary_full_data = pd.DataFrame(full_data.groupby('label_pick')['id'].count())
    summary_full_data.reset_index(inplace = True)
    if sig == 1:
        # Restrict to "significant" clusters only.
        summary_full_data = summary_full_data.loc[summary_full_data['id']>70000]
    sig_cluster = summary_full_data['label_pick'].tolist()
    clus_summary = cluster_summary(full_data)
    for i in sig_cluster:
        pick_long = clus_data.loc[clus_data.index ==i]['centroid_pick_long'].values[0]
        pick_lat = clus_data.loc[clus_data.index ==i]['centroid_pick_lat'].values[0]
        clus_no = clus_data.loc[clus_data.index ==i]['label_pick'].values[0]
        most_visited_clus = clus_summary.loc[clus_summary['label_pick']==i]['label_drop'].values[0]
        avg_triptime = clus_summary.loc[clus_summary['label_pick']==i]['avg_triptime'].values[0]
        pop = 'cluster = '+str(clus_no)+' & most visited cluster = ' +str(most_visited_clus) +' & avg triptime from this cluster =' + str(avg_triptime)
        if circle == 1:
            folium.CircleMarker(location=[pick_lat, pick_long], radius=radius_,
                    color='#F08080',
                    fill_color='#3186cc', popup=pop).add_to(map_1)
        folium.Marker([pick_lat, pick_long], popup=pop).add_to(map_1)
    return map_1
# Render the maps (notebook display): sampled trips, all cluster centroids
# on a zoomed-out terrain view, then only the significant clusters.
osm = show_fmaps(train_data, path=1)
osm
clus_map = clusters_map(centroid_pickups, train_cl, sig =0, zoom =3.2, circle =1, tile = 'Stamen Terrain')
clus_map
clus_map_sig = clusters_map(centroid_pickups, train_cl, sig =1, circle =1)
clus_map_sig

测试集进行相同的处理

# Load the test set and join the OSRM fastest-route summary features,
# mirroring the merge done on the training data.
test_df = pd.read_csv('./data/test.csv')
test_fr = pd.read_csv('./data/fastest_routes_test.csv')
test_fr_new = test_fr[['id', 'total_distance', 'total_travel_time', 'number_of_steps']]
test_df = pd.merge(test_df, test_fr_new, on = 'id', how = 'left')
test_df.head()
idvendor_idpickup_datetimepassenger_countpickup_longitudepickup_latitudedropoff_longitudedropoff_latitudestore_and_fwd_flagtotal_distancetotal_travel_timenumber_of_steps
0id300467212016-06-30 23:59:581-73.98812940.732029-73.99017340.756680N3795.9424.64
1id350535512016-06-30 23:59:531-73.96420340.679993-73.95980840.655403N2904.5200.04
2id121714112016-06-30 23:59:471-73.99743740.737583-73.98616040.729523N1499.5193.24
3id215012622016-06-30 23:59:411-73.95607040.771900-73.98642740.730469N7023.9494.811
4id159824512016-06-30 23:59:331-73.97021540.761475-73.96151040.755890N1108.2103.24

时间特征

start = time.time()
test_data = test_df.copy()
# Parse pickup timestamps and derive the same calendar features as train.
test_data['pickup_datetime'] = pd.to_datetime(test_data.pickup_datetime)
test_data.loc[:, 'pick_month'] = test_data['pickup_datetime'].dt.month
test_data.loc[:, 'hour'] = test_data['pickup_datetime'].dt.hour
# NOTE(review): Series.dt.weekofyear is deprecated in newer pandas;
# `dt.isocalendar().week` is the modern equivalent.
test_data.loc[:, 'week_of_year'] = test_data['pickup_datetime'].dt.weekofyear
test_data.loc[:, 'day_of_year'] = test_data['pickup_datetime'].dt.dayofyear
test_data.loc[:, 'day_of_week'] = test_data['pickup_datetime'].dt.dayofweek
end = time.time()
print("Time taken by above cell is {}.".format(end-start))
Time taken by above cell is 0.8934004306793213.

距离特征

# Straight-line (haversine), manhattan-style distance and bearing between
# pickup and dropoff for the test set, mirroring the training features.
# FIX: timing variable renamed from the 'strat' typo to 'start' for
# consistency with every other timing cell in this script.
start = time.time()
test_data.loc[:,'hvsine_pick_drop'] = haversine_(test_data['pickup_latitude'].values, test_data['pickup_longitude'].values, test_data['dropoff_latitude'].values, test_data['dropoff_longitude'].values)
test_data.loc[:,'manhtn_pick_drop'] = manhattan_distance_pd(test_data['pickup_latitude'].values, test_data['pickup_longitude'].values, test_data['dropoff_latitude'].values, test_data['dropoff_longitude'].values)
test_data.loc[:,'bearing'] = bearing_array(test_data['pickup_latitude'].values, test_data['pickup_longitude'].values, test_data['dropoff_latitude'].values, test_data['dropoff_longitude'].values)
end = time.time()
print("Time taken by above cell is {}.".format(end-start))
Time taken by above cell is 0.3820157051086426.

聚类特征

start = time.time()
# Assign test trips to the clusters learned on the training data and
# attach the corresponding centroid coordinates via left merges.
test_data['label_pick'] = k_means.predict(test_data[['pickup_longitude','pickup_latitude']])
test_data['label_drop'] = k_means.predict(test_data[['dropoff_longitude','dropoff_latitude']])
test_cl = pd.merge(test_data, centroid_pickups, how='left', on=['label_pick'])
test_cl = pd.merge(test_cl, centroid_dropoff, how='left', on=['label_drop'])
#test_cl.head()
end = time.time()
print("Time Taken by above cell is {}.".format(end-start))
Time Taken by above cell is 0.714956521987915.
start = time.time()
# Same centroid-based distance/bearing/speed features as the training set.
test_cl.loc[:,'hvsine_pick_cent_p'] = haversine_(test_cl['pickup_latitude'].values, test_cl['pickup_longitude'].values, test_cl['centroid_pick_lat'].values, test_cl['centroid_pick_long'].values)
test_cl.loc[:,'hvsine_drop_cent_d'] = haversine_(test_cl['dropoff_latitude'].values, test_cl['dropoff_longitude'].values, test_cl['centroid_drop_lat'].values, test_cl['centroid_drop_long'].values)
test_cl.loc[:,'hvsine_cent_p_cent_d'] = haversine_(test_cl['centroid_pick_lat'].values, test_cl['centroid_pick_long'].values, test_cl['centroid_drop_lat'].values, test_cl['centroid_drop_long'].values)
test_cl.loc[:,'manhtn_pick_cent_p'] = manhattan_distance_pd(test_cl['pickup_latitude'].values, test_cl['pickup_longitude'].values, test_cl['centroid_pick_lat'].values, test_cl['centroid_pick_long'].values)
test_cl.loc[:,'manhtn_drop_cent_d'] = manhattan_distance_pd(test_cl['dropoff_latitude'].values, test_cl['dropoff_longitude'].values, test_cl['centroid_drop_lat'].values, test_cl['centroid_drop_long'].values)
test_cl.loc[:,'manhtn_cent_p_cent_d'] = manhattan_distance_pd(test_cl['centroid_pick_lat'].values, test_cl['centroid_pick_long'].values, test_cl['centroid_drop_lat'].values, test_cl['centroid_drop_long'].values)

# NOTE(review): as on the training side, 'bearing_drop_cent_p' actually
# uses the dropoff centroid; the name is kept to match the feature list.
test_cl.loc[:,'bearing_pick_cent_p'] = bearing_array(test_cl['pickup_latitude'].values, test_cl['pickup_longitude'].values, test_cl['centroid_pick_lat'].values, test_cl['centroid_pick_long'].values)
test_cl.loc[:,'bearing_drop_cent_p'] = bearing_array(test_cl['dropoff_latitude'].values, test_cl['dropoff_longitude'].values, test_cl['centroid_drop_lat'].values, test_cl['centroid_drop_long'].values)
test_cl.loc[:,'bearing_cent_p_cent_d'] = bearing_array(test_cl['centroid_pick_lat'].values, test_cl['centroid_pick_long'].values, test_cl['centroid_drop_lat'].values, test_cl['centroid_drop_long'].values)
# Implied average speeds from OSRM travel time.
test_cl['speed_hvsn'] = test_cl.hvsine_pick_drop/test_cl.total_travel_time
test_cl['speed_manhtn'] = test_cl.manhtn_pick_drop/test_cl.total_travel_time
end = time.time()
print("Time Taken by above cell is {}.".format(end-start))
Time Taken by above cell is 1.4610087871551514.
test_cl.head()
idvendor_idpickup_datetimepassenger_countpickup_longitudepickup_latitudedropoff_longitudedropoff_latitudestore_and_fwd_flagtotal_distance...hvsine_drop_cent_dhvsine_cent_p_cent_dmanhtn_pick_cent_pmanhtn_drop_cent_dmanhtn_cent_p_cent_dbearing_pick_cent_pbearing_drop_cent_pbearing_cent_p_cent_dspeed_hvsnspeed_manhtn
0id300467212016-06-30 23:59:581-73.98812940.732029-73.99017340.756680N3795.9...0.4607462.5578130.3164890.5727012.657047166.844127163.485813-2.2675630.0064680.006861
1id350535512016-06-30 23:59:531-73.96420340.679993-73.95980840.655403N2904.5...4.0357350.0000001.6809954.7861090.000000-21.064979-11.9832300.0000000.0137960.015524
2id121714112016-06-30 23:59:471-73.99743740.737583-73.98616040.729523N1499.5...0.1081941.5201910.4258010.1280542.144081-68.660063-78.185156130.8164730.0067610.009557
3id215012622016-06-30 23:59:411-73.95607040.771900-73.98642740.730469N7023.9...0.1176945.6223570.3279420.1664447.65760229.967965-134.876766-150.5838030.0106490.014477
4id159824512016-06-30 23:59:331-73.97021540.761475-73.96151040.755890N1108.2...1.0153840.0000000.6205671.2349230.000000-145.882420-75.6817250.0000000.0093100.013122

5 rows × 37 columns

xgboost模型

import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
import warnings

可以尝试加入PCA特征

# Lets Add PCA features in the model, reference Beluga's PCA
train = train_cl
test = test_cl
start = time.time()
# Fit PCA on ALL pickup+dropoff coordinates from both train and test so
# that both sets are projected into the same rotated coordinate frame.
coords = np.vstack((train[['pickup_latitude', 'pickup_longitude']].values,
                    train[['dropoff_latitude', 'dropoff_longitude']].values,
                    test[['pickup_latitude', 'pickup_longitude']].values,
                    test[['dropoff_latitude', 'dropoff_longitude']].values))

pca = PCA().fit(coords)
# First/second principal components of each endpoint as new features.
train['pickup_pca0'] = pca.transform(train[['pickup_latitude', 'pickup_longitude']])[:, 0]
train['pickup_pca1'] = pca.transform(train[['pickup_latitude', 'pickup_longitude']])[:, 1]
train['dropoff_pca0'] = pca.transform(train[['dropoff_latitude', 'dropoff_longitude']])[:, 0]
train['dropoff_pca1'] = pca.transform(train[['dropoff_latitude', 'dropoff_longitude']])[:, 1]
test['pickup_pca0'] = pca.transform(test[['pickup_latitude', 'pickup_longitude']])[:, 0]
test['pickup_pca1'] = pca.transform(test[['pickup_latitude', 'pickup_longitude']])[:, 1]
test['dropoff_pca0'] = pca.transform(test[['dropoff_latitude', 'dropoff_longitude']])[:, 0]
test['dropoff_pca1'] = pca.transform(test[['dropoff_latitude', 'dropoff_longitude']])[:, 1]
end = time.time()
print("Time Taken by above cell is {}.".format(end - start))
Time Taken by above cell is 1.553161382675171.
# Integer encoding of the flag: 0 when 'N', 1 otherwise.
train['store_and_fwd_flag_int'] = np.where(train['store_and_fwd_flag']=='N', 0, 1)
test['store_and_fwd_flag_int'] = np.where(test['store_and_fwd_flag']=='N', 0, 1)
feature_names = list(train.columns)
# Columns present in train but not test (should be the target + dropoff time).
print("Difference of features in train and test are {}".format(np.setdiff1d(train.columns, test.columns)))
print("")
# Exclude identifiers, raw timestamps, the raw flag and the target itself.
do_not_use_for_training = ['pick_date','id', 'pickup_datetime', 'dropoff_datetime', 'trip_duration', 'store_and_fwd_flag']
feature_names = [f for f in train.columns if f not in do_not_use_for_training]
print("We will be using following features for training {}.".format(feature_names))
print("")
print("Total number of features are {}.".format(len(feature_names)))
Difference of features in train and test are ['dropoff_datetime' 'trip_duration']

We will be using following features for training ['vendor_id', 'passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'total_distance', 'total_travel_time', 'number_of_steps', 'pick_month', 'hour', 'week_of_year', 'day_of_year', 'day_of_week', 'hvsine_pick_drop', 'manhtn_pick_drop', 'bearing', 'label_pick', 'label_drop', 'centroid_pick_long', 'centroid_pick_lat', 'centroid_drop_long', 'centroid_drop_lat', 'hvsine_pick_cent_p', 'hvsine_drop_cent_d', 'hvsine_cent_p_cent_d', 'manhtn_pick_cent_p', 'manhtn_drop_cent_d', 'manhtn_cent_p_cent_d', 'bearing_pick_cent_p', 'bearing_drop_cent_p', 'bearing_cent_p_cent_d', 'speed_hvsn', 'speed_manhtn', 'pickup_pca0', 'pickup_pca1', 'dropoff_pca0', 'dropoff_pca1', 'store_and_fwd_flag_int'].

Total number of features are 39.
# Model log(1 + duration) so that RMSE on y equals RMSLE on raw durations.
y = np.log(train['trip_duration'].values + 1)
start = time.time()
# 80/20 train/validation split with a fixed seed for reproducibility.
Xtr, Xv, ytr, yv = train_test_split(train[feature_names].values, y, test_size=0.2, random_state=1987)
dtrain = xgb.DMatrix(Xtr, label=ytr)
dvalid = xgb.DMatrix(Xv, label=yv)
dtest = xgb.DMatrix(test[feature_names].values)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

xgb_pars = {'min_child_weight': 50, 'eta': 0.3, 'colsample_bytree': 0.3, 'max_depth': 10,
            'subsample': 0.8, 'lambda': 1., 'nthread': -1, 'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:linear'}

# You could try to train with more epoch
# 15 rounds only; early stopping monitors the last watchlist entry (valid).
model = xgb.train(xgb_pars, dtrain, 15, watchlist, early_stopping_rounds=2,
                  maximize=False, verbose_eval=1)
end = time.time()
print("Time taken by above cell is {}.".format(end - start))
print('Modeling RMSLE %.5f' % model.best_score)
[0]	train-rmse:4.22726	valid-rmse:4.22841
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 2 rounds.
[1]	train-rmse:2.98083	valid-rmse:2.98244
[2]	train-rmse:2.11167	valid-rmse:2.11381
[3]	train-rmse:1.51307	valid-rmse:1.51598
[4]	train-rmse:1.10813	valid-rmse:1.11194
[5]	train-rmse:0.836374	valid-rmse:0.841546
[6]	train-rmse:0.663311	valid-rmse:0.669983
[7]	train-rmse:0.558202	valid-rmse:0.566389
[8]	train-rmse:0.485001	valid-rmse:0.494619
[9]	train-rmse:0.451296	valid-rmse:0.462016
[10]	train-rmse:0.431356	valid-rmse:0.443106
[11]	train-rmse:0.420363	valid-rmse:0.432821
[12]	train-rmse:0.415032	valid-rmse:0.427993
[13]	train-rmse:0.410913	valid-rmse:0.424339
[14]	train-rmse:0.409381	valid-rmse:0.423168
Time taken by above cell is 17.472981691360474.
Modeling RMSLE 0.42317

加入更多特征

天气特征

# NYC Central Park daily weather observations for 2016.
weather = pd.read_csv('./data/weather_data_nyc_centralpark_2016.csv')
weather.head()
datemaximum temperatureminimum temperatureaverage temperatureprecipitationsnow fallsnow depth
01-1-2016423438.00.000.00
12-1-2016403236.00.000.00
23-1-2016453540.00.000.00
34-1-2016361425.00.000.00
45-1-2016291120.00.000.00
from ggplot import *
# Daily minimum (blue) / maximum (red) temperature lines over the year,
# with points overlaid on the minimum series.
weather.date = pd.to_datetime(weather.date)
weather['day_of_year']= weather.date.dt.dayofyear
p = ggplot(aes(x='date'),data=weather) + geom_line(aes(y='minimum temperature', colour = "blue")) + geom_line(aes(y='maximum temperature', colour = "red"))
p + geom_point(aes(y='minimum temperature',colour = "blue")) #+ stat_smooth(colour='yellow', span=0.2)

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-egnwFyZL-1633489386760)(output_98_0.png)]


下雪,降雨,积雪情况

import matplotlib.pyplot as plt
%matplotlib inline
# weather['precipitation'].unique()  # exploratory check (no effect on state)
# 'T' in the NOAA data means "trace" amounts; treat it as zero and convert
# the three measurement columns to float. One replace+astype per column
# replaces the original np.where(...) + list(map(float, ...)) pairs.
for col in ['precipitation', 'snow fall', 'snow depth']:
    weather[col] = weather[col].replace('T', '0.00').astype(float)
import plotly.plotly as py
import plotly.graph_objs as go
import plotly
# x axis: observation date; y: the three weather series.
# (An exact copy-pasted duplicate of these imports and assignments used to
# follow here; the redundant block was removed.)
random_x = weather['date'].values
random_y0 = weather['precipitation']
random_y1 = weather['snow fall']
random_y2 = weather['snow depth']

# Create one scatter trace per series.
trace0 = go.Scatter(
    x = random_x,
    y = random_y0,
    mode = 'markers',
    name = 'precipitation'
)
trace1 = go.Scatter(
    x = random_x,
    y = random_y1,
    mode = 'markers',
    name = 'snow fall'
)
trace2 = go.Scatter(
    x = random_x,
    y = random_y2,
    mode = 'markers',
    name = 'snow depth'
)

data = [trace0, trace1, trace2]
plotly.offline.iplot(data, filename='scatter-mode')

动作方向

def freq_turn(step_dir):
    """Count turn types in a '|'-separated step-direction string.

    Parameters
    ----------
    step_dir : str
        OSRM step directions, e.g. 'left|straight|right|left'.

    Returns
    -------
    tuple of int
        (straight, left, right) counts; a direction that never occurs
        counts as 0.
    """
    from collections import Counter
    counts = Counter(step_dir.split("|"))
    # Counter returns 0 for missing keys, so the original intermediate
    # dict and the three explicit membership checks are unnecessary.
    return counts['straight'], counts['left'], counts['right']
start = time.time()
# Per-trip counts of straight/left/right steps from the OSRM
# step_direction string; freq_turn returns one 3-tuple per row.
train_fr['straight']= 0
train_fr['left'] =0
train_fr['right'] = 0
train_fr['straight'], train_fr['left'], train_fr['right'] = zip(*train_fr['step_direction'].map(freq_turn))
end = time.time()
print("Time Taken by above cell is {}.".format(end - start))
Time Taken by above cell is 12.961659669876099.
# Merge the new turn-count columns into the training frame by trip id.
train_fr_new = train_fr[['id','straight','left','right']]
train = pd.merge(train, train_fr_new, on = 'id', how = 'left')
#train = pd.merge(train, weather, on= 'date', how = 'left')
print(len(train.columns))
#train.columns
47
train.head()
idvendor_idpickup_datetimedropoff_datetimepassenger_countpickup_longitudepickup_latitudedropoff_longitudedropoff_latitudestore_and_fwd_flag...speed_hvsnspeed_manhtnpickup_pca0pickup_pca1dropoff_pca0dropoff_pca1store_and_fwd_flag_intstraightleftright
0id287542122016-03-14 17:24:552016-03-14 17:32:301-73.98215540.767937-73.96463040.765602N...0.0090870.0105240.0076910.017053-0.0096660.01369502.01.01.0
1id237739412016-06-12 00:43:352016-06-12 00:54:381-73.98041540.738564-73.99948140.731152N...0.0054380.0073210.007677-0.0123710.027145-0.01865200.02.02.0
2id385852922016-01-19 11:35:242016-01-19 12:10:481-73.97902740.763939-74.00533340.710087N...0.0083180.0106870.0048030.0128790.034222-0.03933703.04.05.0
3id350467322016-04-06 19:32:312016-04-06 19:39:401-74.01004040.719971-74.01226840.706718N...0.0063000.0070460.038342-0.0291940.041343-0.04229300.02.01.0
4id218102822016-03-26 13:30:552016-03-26 13:38:101-73.97305340.793209-73.97292340.782520N...0.0084840.008561-0.0028770.041749-0.0023800.03107100.02.02.0

5 rows × 47 columns

加入天气特征

# Build a midnight-normalised 'date' key from the pickup timestamp so the
# daily weather table can be left-joined onto each trip.
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])
train['date'] = train['pickup_datetime'].dt.date
train.head()

train['date'] = pd.to_datetime(train['date'])
train = pd.merge(train, weather[['date','minimum temperature', 'precipitation', 'snow fall', 'snow depth']], on= 'date', how = 'left')
train.shape[0]
1458644
# Cross-centroid distances: pickup point to the *dropoff* cluster centroid
# and dropoff point to the *pickup* cluster centroid, for both sets.
train.loc[:,'hvsine_pick_cent_d'] = haversine_(train['pickup_latitude'].values, train['pickup_longitude'].values, train['centroid_drop_lat'].values, train['centroid_drop_long'].values)
train.loc[:,'hvsine_drop_cent_p'] = haversine_(train['dropoff_latitude'].values, train['dropoff_longitude'].values, train['centroid_pick_lat'].values, train['centroid_pick_long'].values)

test.loc[:,'hvsine_pick_cent_d'] = haversine_(test['pickup_latitude'].values, test['pickup_longitude'].values, test['centroid_drop_lat'].values, test['centroid_drop_long'].values)
test.loc[:,'hvsine_drop_cent_p'] = haversine_(test['dropoff_latitude'].values, test['dropoff_longitude'].values, test['centroid_pick_lat'].values, test['centroid_pick_long'].values)

print("shape of train_features is {}.".format(len(train.columns)))
shape of train_features is 54.

测试集采用相同的特征

start = time.time()
# Same straight/left/right turn-count features for the test routes.
test_fr['straight']= 0
test_fr['left'] =0
test_fr['right'] = 0
test_fr['straight'], test_fr['left'], test_fr['right'] = zip(*test_fr['step_direction'].map(freq_turn))
end = time.time()
print("Time Taken by above cell is {}.".format(end - start))
#test_fr.head()
Time Taken by above cell is 5.300434827804565.
# Merge turn counts into the test frame by trip id.
test_fr_new = test_fr[['id','straight','left','right']]
test = pd.merge(test, test_fr_new, on = 'id', how = 'left')
print(len(test.columns))
#test.columns
47
# Build the test date key, join the weather columns, and recompute the
# final feature list shared by train and test.
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])
test['date'] = test['pickup_datetime'].dt.date
test['date'] = pd.to_datetime(test['date'])
test= pd.merge(test, weather[['date','minimum temperature', 'precipitation', 'snow fall', 'snow depth']], on= 'date', how = 'left')
feature_names = list(train.columns)
print("Difference of features in train and test are {}".format(np.setdiff1d(train.columns, test.columns)))
print("")
# 'date' joins the exclusion list now that it exists in both frames.
do_not_use_for_training = ['pick_date','id', 'pickup_datetime', 'dropoff_datetime', 'trip_duration', 'store_and_fwd_flag', 'date']
feature_names = [f for f in train.columns if f not in do_not_use_for_training]
print("We will be using following features for training {}.".format(feature_names))
print("")
print("Total number of features are {}.".format(len(feature_names)))
Difference of features in train and test are ['dropoff_datetime' 'trip_duration']

We will be using following features for training ['vendor_id', 'passenger_count', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'total_distance', 'total_travel_time', 'number_of_steps', 'pick_month', 'hour', 'week_of_year', 'day_of_year', 'day_of_week', 'hvsine_pick_drop', 'manhtn_pick_drop', 'bearing', 'label_pick', 'label_drop', 'centroid_pick_long', 'centroid_pick_lat', 'centroid_drop_long', 'centroid_drop_lat', 'hvsine_pick_cent_p', 'hvsine_drop_cent_d', 'hvsine_cent_p_cent_d', 'manhtn_pick_cent_p', 'manhtn_drop_cent_d', 'manhtn_cent_p_cent_d', 'bearing_pick_cent_p', 'bearing_drop_cent_p', 'bearing_cent_p_cent_d', 'speed_hvsn', 'speed_manhtn', 'pickup_pca0', 'pickup_pca1', 'dropoff_pca0', 'dropoff_pca1', 'store_and_fwd_flag_int', 'straight', 'left', 'right', 'minimum temperature', 'precipitation', 'snow fall', 'snow depth', 'hvsine_pick_cent_d', 'hvsine_drop_cent_p'].

Total number of features are 48.
y = np.log(train['trip_duration'].values + 1)

再次训练模型

# Rebuild the DMatrices with the enlarged feature set (same split seed).
Xtr, Xv, ytr, yv = train_test_split(train[feature_names].values, y, test_size=0.2, random_state=1987)
dtrain = xgb.DMatrix(Xtr, label=ytr)
dvalid = xgb.DMatrix(Xv, label=yv)
dtest = xgb.DMatrix(test[feature_names].values)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]

start = time.time()
xgb_pars = {'min_child_weight': 50, 'eta': 0.3, 'colsample_bytree': 0.3, 'max_depth': 10,
            'subsample': 0.8, 'lambda': 1., 'nthread': -1, 'booster' : 'gbtree', 'silent': 1,
            'eval_metric': 'rmse', 'objective': 'reg:linear'}

# BUG FIXES: the params dict is named `xgb_pars` (it was passed as the
# undefined `xgb_par`), and the reported score must come from `model_1`,
# not the earlier `model` (which printed the stale first-run 0.42317).
model_1 = xgb.train(xgb_pars, dtrain, 100, watchlist, early_stopping_rounds=4, maximize=False, verbose_eval=1)
print('Modeling RMSLE %.5f' % model_1.best_score)
end = time.time()
print("Time taken in training is {}.".format(end - start))

[0]	train-rmse:5.72042	valid-rmse:5.72132
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 4 rounds.
[1]	train-rmse:5.43622	valid-rmse:5.43719
[2]	train-rmse:5.16677	valid-rmse:5.16779
[3]	train-rmse:4.91052	valid-rmse:4.91162
[4]	train-rmse:4.6672	valid-rmse:4.66837
[5]	train-rmse:4.43612	valid-rmse:4.43735
[6]	train-rmse:4.21655	valid-rmse:4.21782
[7]	train-rmse:4.00819	valid-rmse:4.00953
[8]	train-rmse:3.81012	valid-rmse:3.81151
[9]	train-rmse:3.62209	valid-rmse:3.62354
[10]	train-rmse:3.44373	valid-rmse:3.44527
[11]	train-rmse:3.2744	valid-rmse:3.27601
[12]	train-rmse:3.11365	valid-rmse:3.11535
[13]	train-rmse:2.9613	valid-rmse:2.96308
[14]	train-rmse:2.81629	valid-rmse:2.81817
[15]	train-rmse:2.67887	valid-rmse:2.68086
[16]	train-rmse:2.54841	valid-rmse:2.55053
[17]	train-rmse:2.42448	valid-rmse:2.42672
[18]	train-rmse:2.30707	valid-rmse:2.30944
[19]	train-rmse:2.19537	valid-rmse:2.19787
[20]	train-rmse:2.08975	valid-rmse:2.09242
[21]	train-rmse:1.98954	valid-rmse:1.99237
[22]	train-rmse:1.89417	valid-rmse:1.89717
[23]	train-rmse:1.80406	valid-rmse:1.80727
[24]	train-rmse:1.71832	valid-rmse:1.72173
[25]	train-rmse:1.63703	valid-rmse:1.64065
[26]	train-rmse:1.56005	valid-rmse:1.56394
[27]	train-rmse:1.48717	valid-rmse:1.49133
[28]	train-rmse:1.41816	valid-rmse:1.42261
[29]	train-rmse:1.35304	valid-rmse:1.35777
[30]	train-rmse:1.29104	valid-rmse:1.29611
[31]	train-rmse:1.23239	valid-rmse:1.2378
[32]	train-rmse:1.17727	valid-rmse:1.18303
[33]	train-rmse:1.12499	valid-rmse:1.13116
[34]	train-rmse:1.0754	valid-rmse:1.08201
[35]	train-rmse:1.02851	valid-rmse:1.03561
[36]	train-rmse:0.984373	valid-rmse:0.991948
[37]	train-rmse:0.942633	valid-rmse:0.950732
[38]	train-rmse:0.903371	valid-rmse:0.912026
[39]	train-rmse:0.866419	valid-rmse:0.875636
[40]	train-rmse:0.831615	valid-rmse:0.841465
[41]	train-rmse:0.798508	valid-rmse:0.809044
[42]	train-rmse:0.76716	valid-rmse:0.778418
[43]	train-rmse:0.738213	valid-rmse:0.750092
[44]	train-rmse:0.710718	valid-rmse:0.723319
[45]	train-rmse:0.684879	valid-rmse:0.698242
[46]	train-rmse:0.660684	valid-rmse:0.674864
[47]	train-rmse:0.637616	valid-rmse:0.6527
[48]	train-rmse:0.615922	valid-rmse:0.63198
[49]	train-rmse:0.595885	valid-rmse:0.612811
[50]	train-rmse:0.577099	valid-rmse:0.59497
[51]	train-rmse:0.559619	valid-rmse:0.57841
[52]	train-rmse:0.54312	valid-rmse:0.562796
[53]	train-rmse:0.527632	valid-rmse:0.548306
[54]	train-rmse:0.513364	valid-rmse:0.534999
[55]	train-rmse:0.499734	valid-rmse:0.522428
[56]	train-rmse:0.487051	valid-rmse:0.510792
[57]	train-rmse:0.475515	valid-rmse:0.500199
[58]	train-rmse:0.465022	valid-rmse:0.490585
[59]	train-rmse:0.454814	valid-rmse:0.481421
[60]	train-rmse:0.445312	valid-rmse:0.472921
[61]	train-rmse:0.436683	valid-rmse:0.465274
[62]	train-rmse:0.428942	valid-rmse:0.458362
[63]	train-rmse:0.421499	valid-rmse:0.451837
[64]	train-rmse:0.414361	valid-rmse:0.445674
[65]	train-rmse:0.407798	valid-rmse:0.440093
[66]	train-rmse:0.401684	valid-rmse:0.43488
[67]	train-rmse:0.396445	valid-rmse:0.430372
[68]	train-rmse:0.391588	valid-rmse:0.426227
[69]	train-rmse:0.386804	valid-rmse:0.422285
[70]	train-rmse:0.382344	valid-rmse:0.418661
[71]	train-rmse:0.378198	valid-rmse:0.415358
[72]	train-rmse:0.374537	valid-rmse:0.412416
[73]	train-rmse:0.371061	valid-rmse:0.409669
[74]	train-rmse:0.367815	valid-rmse:0.407142
[75]	train-rmse:0.365014	valid-rmse:0.404898
[76]	train-rmse:0.362352	valid-rmse:0.402853
[77]	train-rmse:0.359678	valid-rmse:0.400882
[78]	train-rmse:0.357404	valid-rmse:0.399173
[79]	train-rmse:0.355237	valid-rmse:0.397581
[80]	train-rmse:0.353313	valid-rmse:0.396152
[81]	train-rmse:0.351466	valid-rmse:0.394852
[82]	train-rmse:0.349827	valid-rmse:0.393688
[83]	train-rmse:0.348238	valid-rmse:0.392624
[84]	train-rmse:0.346586	valid-rmse:0.391577
[85]	train-rmse:0.344865	valid-rmse:0.390459
[86]	train-rmse:0.343565	valid-rmse:0.38962
[87]	train-rmse:0.342047	valid-rmse:0.388682
[88]	train-rmse:0.340773	valid-rmse:0.387944
[89]	train-rmse:0.339611	valid-rmse:0.387237
[90]	train-rmse:0.338232	valid-rmse:0.386392
[91]	train-rmse:0.337017	valid-rmse:0.38571
[92]	train-rmse:0.33599	valid-rmse:0.385164
[93]	train-rmse:0.334952	valid-rmse:0.384605
[94]	train-rmse:0.333857	valid-rmse:0.384042
[95]	train-rmse:0.332787	valid-rmse:0.383526
[96]	train-rmse:0.332035	valid-rmse:0.383221
[97]	train-rmse:0.331577	valid-rmse:0.38295
[98]	train-rmse:0.330563	valid-rmse:0.382527
[99]	train-rmse:0.329945	valid-rmse:0.382268
Modeling RMSLE 0.42317
Time taken in training is 184.5209550857544.
# Report the improved model's best validation score, then predict on the
# validation and test matrices.
# NOTE(review): `start` here still refers to the training cell, so the
# first elapsed time printed is not this cell's own runtime.
print('Modeling RMSLE %.5f' % model_1.best_score)
end = time.time()
print("Time taken in training is {}.".format(end - start))
start = time.time()
yvalid = model_1.predict(dvalid)
ytest = model_1.predict(dtest)
end = time.time()
print("Time taken in prediction is {}.".format(end - start))
Modeling RMSLE 3.62354
Time taken in training is 16.804673671722412.
Time taken in prediction is 0.07018685340881348.
start = time.time()
# Sanity check: exactly one prediction per test row. The original printed
# nothing at all on a mismatch; now it warns instead of staying silent.
if test.shape[0] == ytest.shape[0]:
    print('Test shape OK.') 
else:
    print('WARNING: prediction count does not match test row count.')
# Invert the log(x + 1) transform applied to the target before writing.
test['trip_duration'] = np.exp(ytest) - 1
test[['id', 'trip_duration']].to_csv('mahesh_xgb_submission.csv', index=False)
end = time.time()
# FIX: message said "training"; this cell writes the submission file.
print("Time taken in writing submission is {}.".format(end - start))
Test shape OK.
Time taken in training is 1.3792648315429688.
转载请注明:文章转载自 www.mshxw.com
本文地址:https://www.mshxw.com/it/295929.html
我们一直用心在做
关于我们 文章归档 网站地图 联系我们

版权所有 (c)2021-2022 MSHXW.COM

ICP备案号:晋ICP备2021003244-6号