import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import f_classif, SelectFromModel, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier as RFC
%matplotlib inline
plt.rcParams['font.family'] = ['SimHei'] # SimHei font so Chinese labels render in figures
plt.rcParams['axes.unicode_minus']=False # keep the minus sign displayable under a CJK font
# Load the raw user-churn dataset (tab-separated file)
df = pd.read_table('userlostprob.txt')
# Preview the first five rows
df.head()
| label | sampleid | d | arrival | iforderpv_24h | decisionhabit_user | historyvisit_7ordernum | historyvisit_totalordernum | hotelcr | ordercanceledprecent | ... | lowestprice_pre2 | lasthtlordergap | businessrate_pre2 | cityuvs | cityorders | lastpvgap | cr | sid | visitnum_oneyear | h |
|---|
| 0 | 0 | 24636 | 2016-05-18 | 2016-05-18 | 0 | NaN | NaN | NaN | 1.04 | NaN | ... | 615.0 | NaN | 0.29 | 12.880 | 3.147 | NaN | NaN | 7 | NaN | 12 |
|---|
| 1 | 1 | 24637 | 2016-05-18 | 2016-05-18 | 0 | NaN | NaN | NaN | 1.06 | NaN | ... | 513.0 | NaN | 0.53 | 17.933 | 4.913 | NaN | NaN | 33 | NaN | 14 |
|---|
| 2 | 0 | 24641 | 2016-05-18 | 2016-05-19 | 0 | NaN | NaN | NaN | 1.05 | NaN | ... | 382.0 | NaN | 0.60 | 3.993 | 0.760 | NaN | NaN | 10 | NaN | 19 |
|---|
| 3 | 0 | 24642 | 2016-05-18 | 2016-05-18 | 0 | NaN | NaN | NaN | 1.01 | NaN | ... | 203.0 | NaN | 0.18 | 3.220 | 0.660 | NaN | NaN | 8 | NaN | 16 |
|---|
| 4 | 1 | 24644 | 2016-05-18 | 2016-05-19 | 0 | NaN | NaN | NaN | 1.00 | NaN | ... | 84.0 | NaN | NaN | 0.013 | NaN | NaN | NaN | 1 | NaN | 21 |
|---|
5 rows × 51 columns
# Class balance of the target: 500588 negatives vs 189357 positives (~27% churn)
df['label'].value_counts()
0 500588
1 189357
Name: label, dtype: int64
# Preview the last five rows
df.tail()
| label | sampleid | d | arrival | iforderpv_24h | decisionhabit_user | historyvisit_7ordernum | historyvisit_totalordernum | hotelcr | ordercanceledprecent | ... | lowestprice_pre2 | lasthtlordergap | businessrate_pre2 | cityuvs | cityorders | lastpvgap | cr | sid | visitnum_oneyear | h |
|---|
| 689940 | 1 | 2238419 | 2016-05-15 | 2016-05-17 | 1 | 19.0 | NaN | NaN | 1.06 | NaN | ... | 406.0 | NaN | 0.48 | 13.573 | 1.660 | 1034.0 | 1.0 | 5 | 119.0 | 18 |
|---|
| 689941 | 1 | 2238421 | 2016-05-15 | 2016-05-15 | 1 | 10.0 | 3.0 | 3.0 | 1.06 | 0.33 | ... | 199.0 | 713.0 | 0.51 | 2.880 | 0.513 | 179.0 | 2.0 | 15 | 1472.0 | 12 |
|---|
| 689942 | 0 | 2238422 | 2016-05-15 | 2016-05-17 | 0 | NaN | NaN | NaN | 1.07 | NaN | ... | 544.0 | NaN | 0.45 | 15.293 | 2.067 | 0.0 | NaN | 8 | 107.0 | 0 |
|---|
| 689943 | 0 | 2238425 | 2016-05-15 | 2016-05-17 | 0 | NaN | NaN | NaN | 1.04 | NaN | ... | 156.0 | NaN | 0.29 | 2.467 | 0.333 | NaN | NaN | 4 | NaN | 0 |
|---|
| 689944 | 0 | 2238426 | 2016-05-15 | 2016-05-15 | 0 | NaN | NaN | NaN | 1.02 | NaN | ... | 275.0 | NaN | NaN | 12.600 | 2.653 | NaN | NaN | 2 | NaN | 11 |
|---|
5 rows × 51 columns
# Preview five random rows
df.sample(5)
| label | sampleid | d | arrival | iforderpv_24h | decisionhabit_user | historyvisit_7ordernum | historyvisit_totalordernum | hotelcr | ordercanceledprecent | ... | lowestprice_pre2 | lasthtlordergap | businessrate_pre2 | cityuvs | cityorders | lastpvgap | cr | sid | visitnum_oneyear | h |
|---|
| 477013 | 1 | 820235 | 2016-05-21 | 2016-05-21 | 0 | 15.0 | NaN | 15.0 | 1.05 | 0.36 | ... | 582.0 | 18831.0 | 0.48 | 17.220 | 3.400 | 4242.0 | 1.33 | 446 | 906.0 | 9 |
|---|
| 426926 | 0 | 736598 | 2016-05-15 | 2016-05-15 | 0 | 1.0 | NaN | 39.0 | 1.05 | 0.16 | ... | 978.0 | 12199.0 | 0.13 | 5.113 | 0.847 | 642.0 | 1.36 | 732 | 2583.0 | 8 |
|---|
| 628554 | 0 | 1072402 | 2016-05-20 | 2016-05-20 | 0 | NaN | NaN | 3.0 | 1.02 | 0.00 | ... | 147.0 | 55214.0 | 0.27 | 15.873 | 3.220 | 10002.0 | 1.11 | 186 | 905.0 | 19 |
|---|
| 248275 | 0 | 438633 | 2016-05-18 | 2016-06-09 | 0 | 19.0 | 2.0 | 28.0 | 1.02 | 0.78 | ... | NaN | 3329.0 | NaN | 1.320 | 0.087 | 145.0 | 1.12 | 449 | 17397.0 | 11 |
|---|
| 198972 | 0 | 356550 | 2016-05-19 | 2016-05-19 | 0 | 7.0 | NaN | 2.0 | 1.04 | 0.50 | ... | 206.0 | 61467.0 | 0.32 | 20.480 | 5.153 | 13264.0 | 1.08 | 59 | 1522.0 | 20 |
|---|
5 rows × 51 columns
# Dataset shape: (rows, columns)
df.shape
(689945, 51)
# Column dtypes ('d' and 'arrival' are object-typed date strings)
df.dtypes
label int64
sampleid int64
d object
arrival object
iforderpv_24h int64
decisionhabit_user float64
historyvisit_7ordernum float64
historyvisit_totalordernum float64
hotelcr float64
ordercanceledprecent float64
landhalfhours float64
ordercanncelednum float64
commentnums float64
starprefer float64
novoters float64
consuming_capacity float64
historyvisit_avghotelnum float64
cancelrate float64
historyvisit_visit_detailpagenum float64
delta_price1 float64
price_sensitive float64
hoteluv float64
businessrate_pre float64
ordernum_oneyear float64
cr_pre float64
avgprice float64
lowestprice float64
firstorder_bu float64
customereval_pre2 float64
delta_price2 float64
commentnums_pre float64
customer_value_profit float64
commentnums_pre2 float64
cancelrate_pre float64
novoters_pre2 float64
novoters_pre float64
ctrip_profits float64
deltaprice_pre2_t1 float64
lowestprice_pre float64
uv_pre float64
uv_pre2 float64
lowestprice_pre2 float64
lasthtlordergap float64
businessrate_pre2 float64
cityuvs float64
cityorders float64
lastpvgap float64
cr float64
sid int64
visitnum_oneyear float64
h int64
dtype: object
# Non-null counts, dtypes and memory usage per column
df.info()
RangeIndex: 689945 entries, 0 to 689944
Data columns (total 51 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 label 689945 non-null int64
1 sampleid 689945 non-null int64
2 d 689945 non-null object
3 arrival 689945 non-null object
4 iforderpv_24h 689945 non-null int64
5 decisionhabit_user 385450 non-null float64
6 historyvisit_7ordernum 82915 non-null float64
7 historyvisit_totalordernum 386525 non-null float64
8 hotelcr 689148 non-null float64
9 ordercanceledprecent 447831 non-null float64
10 landhalfhours 661312 non-null float64
11 ordercanncelednum 447831 non-null float64
12 commentnums 622029 non-null float64
13 starprefer 464892 non-null float64
14 novoters 672918 non-null float64
15 consuming_capacity 463837 non-null float64
16 historyvisit_avghotelnum 387876 non-null float64
17 cancelrate 678227 non-null float64
18 historyvisit_visit_detailpagenum 307234 non-null float64
19 delta_price1 437146 non-null float64
20 price_sensitive 463837 non-null float64
21 hoteluv 689148 non-null float64
22 businessrate_pre 483896 non-null float64
23 ordernum_oneyear 447831 non-null float64
24 cr_pre 660548 non-null float64
25 avgprice 457261 non-null float64
26 lowestprice 687931 non-null float64
27 firstorder_bu 376993 non-null float64
28 customereval_pre2 661312 non-null float64
29 delta_price2 437750 non-null float64
30 commentnums_pre 598368 non-null float64
31 customer_value_profit 439123 non-null float64
32 commentnums_pre2 648457 non-null float64
33 cancelrate_pre 653015 non-null float64
34 novoters_pre2 657616 non-null float64
35 novoters_pre 648956 non-null float64
36 ctrip_profits 445187 non-null float64
37 deltaprice_pre2_t1 543180 non-null float64
38 lowestprice_pre 659689 non-null float64
39 uv_pre 660548 non-null float64
40 uv_pre2 661189 non-null float64
41 lowestprice_pre2 660664 non-null float64
42 lasthtlordergap 447831 non-null float64
43 businessrate_pre2 602960 non-null float64
44 cityuvs 682274 non-null float64
45 cityorders 651263 non-null float64
46 lastpvgap 592818 non-null float64
47 cr 457896 non-null float64
48 sid 689945 non-null int64
49 visitnum_oneyear 592910 non-null float64
50 h 689945 non-null int64
dtypes: float64(44), int64(5), object(2)
memory usage: 268.5+ MB
# Descriptive statistics with extended percentiles (1%/10%/.../99%)
df.describe([0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99])
| label | sampleid | iforderpv_24h | decisionhabit_user | historyvisit_7ordernum | historyvisit_totalordernum | hotelcr | ordercanceledprecent | landhalfhours | ordercanncelednum | ... | lowestprice_pre2 | lasthtlordergap | businessrate_pre2 | cityuvs | cityorders | lastpvgap | cr | sid | visitnum_oneyear | h |
|---|
| count | 689945.000000 | 6.899450e+05 | 689945.000000 | 385450.000000 | 82915.000000 | 386525.000000 | 689148.000000 | 447831.000000 | 661312.000000 | 447831.000000 | ... | 660664.000000 | 447831.000000 | 602960.000000 | 682274.000000 | 651263.000000 | 592818.000000 | 457896.000000 | 689945.000000 | 5.929100e+05 | 689945.000000 |
|---|
| mean | 0.274452 | 6.285402e+05 | 0.193737 | 5.317048 | 1.856094 | 11.710487 | 1.060996 | 0.342119 | 6.086366 | 154.179369 | ... | 318.541812 | 101830.919400 | 0.368237 | 10.648278 | 2.253250 | 12049.409382 | 1.137476 | 153.702414 | 1.855185e+04 | 14.462315 |
|---|
| std | 0.446238 | 4.146815e+05 | 0.395226 | 38.524483 | 2.103862 | 17.251429 | 0.045264 | 0.354210 | 12.413225 | 398.456986 | ... | 351.913035 | 122784.313864 | 0.219945 | 15.696682 | 3.538453 | 25601.374138 | 0.204789 | 277.807697 | 2.288603e+05 | 6.301575 |
|---|
| min | 0.000000 | 2.463600e+04 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 1.000000 | 0.000000 | 0.000000 | 0.007000 | 0.007000 | 0.000000 | 1.000000 | 0.000000 | 1.000000e+00 | 0.000000 |
|---|
| 1% | 0.000000 | 3.620588e+04 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 52.000000 | 244.000000 | 0.010000 | 0.013000 | 0.007000 | 0.000000 | 1.000000 | 1.000000 | 2.100000e+01 | 0.000000 |
|---|
| 10% | 0.000000 | 1.398464e+05 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.010000 | 0.000000 | 0.000000 | 0.000000 | ... | 101.000000 | 3518.000000 | 0.050000 | 0.160000 | 0.033000 | 127.000000 | 1.000000 | 4.000000 | 1.610000e+02 | 6.000000 |
|---|
| 25% | 0.000000 | 3.123200e+05 | 0.000000 | 2.000000 | 1.000000 | 2.000000 | 1.030000 | 0.000000 | 0.000000 | 0.000000 | ... | 145.000000 | 14999.000000 | 0.170000 | 0.827000 | 0.127000 | 551.000000 | 1.000000 | 17.000000 | 4.710000e+02 | 11.000000 |
|---|
| 50% | 0.000000 | 5.996370e+05 | 0.000000 | 3.000000 | 1.000000 | 6.000000 | 1.050000 | 0.250000 | 0.000000 | 2.000000 | ... | 233.000000 | 46890.000000 | 0.400000 | 3.527000 | 0.627000 | 2848.000000 | 1.050000 | 62.000000 | 1.315000e+03 | 15.000000 |
|---|
| 75% | 1.000000 | 8.874600e+05 | 0.000000 | 5.000000 | 2.000000 | 14.000000 | 1.090000 | 0.570000 | 4.000000 | 153.000000 | ... | 388.000000 | 138953.000000 | 0.550000 | 13.327000 | 2.747000 | 10726.000000 | 1.210000 | 180.000000 | 3.141000e+03 | 20.000000 |
|---|
| 90% | 1.000000 | 1.059705e+06 | 1.000000 | 10.000000 | 3.000000 | 29.000000 | 1.120000 | 0.980000 | 27.000000 | 492.000000 | ... | 611.000000 | 311492.000000 | 0.650000 | 35.567000 | 7.547000 | 30384.900000 | 1.400000 | 392.000000 | 6.634000e+03 | 22.000000 |
|---|
| 99% | 1.000000 | 2.226893e+06 | 1.000000 | 27.000000 | 7.000000 | 82.000000 | 1.190000 | 1.000000 | 48.000000 | 1752.000000 | ... | 1464.000000 | 484734.000000 | 0.780000 | 66.007000 | 14.453000 | 138722.000000 | 2.000000 | 1212.000000 | 2.625670e+05 | 23.000000 |
|---|
| max | 1.000000 | 2.238426e+06 | 1.000000 | 3167.000000 | 106.000000 | 711.000000 | 3.180000 | 1.000000 | 49.000000 | 13475.000000 | ... | 43700.000000 | 527026.000000 | 0.990000 | 67.140000 | 14.507000 | 194386.000000 | 11.000000 | 9956.000000 | 9.651192e+06 | 23.000000 |
|---|
12 rows × 49 columns
# Remove duplicate rows (the following shape shows none were found)
df.drop_duplicates(inplace=True)
df.shape
(689945, 51)
# Rank features by fraction of missing values, ascending
null = df.isnull().mean().reset_index().sort_values(0)
# Column labels kept in Chinese ('特征' = feature, '缺失比' = missing ratio)
# because later cells index null_1 by these exact strings
null_1 = null.rename(columns={'index':'特征', 0:'缺失比'})
null_1
| 特征 | 缺失比 |
|---|
| 0 | label | 0.000000 |
|---|
| 48 | sid | 0.000000 |
|---|
| 4 | iforderpv_24h | 0.000000 |
|---|
| 50 | h | 0.000000 |
|---|
| 2 | d | 0.000000 |
|---|
| 1 | sampleid | 0.000000 |
|---|
| 3 | arrival | 0.000000 |
|---|
| 8 | hotelcr | 0.001155 |
|---|
| 21 | hoteluv | 0.001155 |
|---|
| 26 | lowestprice | 0.002919 |
|---|
| 44 | cityuvs | 0.011118 |
|---|
| 17 | cancelrate | 0.016984 |
|---|
| 14 | novoters | 0.024679 |
|---|
| 28 | customereval_pre2 | 0.041500 |
|---|
| 10 | landhalfhours | 0.041500 |
|---|
| 40 | uv_pre2 | 0.041679 |
|---|
| 41 | lowestprice_pre2 | 0.042440 |
|---|
| 39 | uv_pre | 0.042608 |
|---|
| 24 | cr_pre | 0.042608 |
|---|
| 38 | lowestprice_pre | 0.043853 |
|---|
| 34 | novoters_pre2 | 0.046857 |
|---|
| 33 | cancelrate_pre | 0.053526 |
|---|
| 45 | cityorders | 0.056065 |
|---|
| 35 | novoters_pre | 0.059409 |
|---|
| 32 | commentnums_pre2 | 0.060132 |
|---|
| 12 | commentnums | 0.098437 |
|---|
| 43 | businessrate_pre2 | 0.126075 |
|---|
| 30 | commentnums_pre | 0.132731 |
|---|
| 49 | visitnum_oneyear | 0.140642 |
|---|
| 46 | lastpvgap | 0.140775 |
|---|
| 37 | deltaprice_pre2_t1 | 0.212720 |
|---|
| 22 | businessrate_pre | 0.298646 |
|---|
| 13 | starprefer | 0.326190 |
|---|
| 20 | price_sensitive | 0.327719 |
|---|
| 15 | consuming_capacity | 0.327719 |
|---|
| 47 | cr | 0.336330 |
|---|
| 25 | avgprice | 0.337250 |
|---|
| 23 | ordernum_oneyear | 0.350918 |
|---|
| 42 | lasthtlordergap | 0.350918 |
|---|
| 11 | ordercanncelednum | 0.350918 |
|---|
| 9 | ordercanceledprecent | 0.350918 |
|---|
| 36 | ctrip_profits | 0.354750 |
|---|
| 31 | customer_value_profit | 0.363539 |
|---|
| 29 | delta_price2 | 0.365529 |
|---|
| 19 | delta_price1 | 0.366405 |
|---|
| 16 | historyvisit_avghotelnum | 0.437816 |
|---|
| 7 | historyvisit_totalordernum | 0.439774 |
|---|
| 5 | decisionhabit_user | 0.441332 |
|---|
| 27 | firstorder_bu | 0.453590 |
|---|
| 18 | historyvisit_visit_detailpagenum | 0.554698 |
|---|
| 6 | historyvisit_7ordernum | 0.879824 |
|---|
# Density plot of the per-feature missing ratios
plt.figure(figsize=(8,6))
sns.kdeplot(null_1['缺失比'], shade=True)  # NOTE(review): `shade` is deprecated in newer seaborn — use fill=True
# Bar chart of the same missing ratios, one bar per feature
plt.figure(figsize=(8,6))
plt.bar(range(null_1.shape[0]), null_1['缺失比'], label='lost rate')
plt.legend(loc='best')
# Drop the column with ~88% missing values (see ranking above: 0.879824)
df = df.drop(['historyvisit_7ordernum'], axis=1)
df
| label | sampleid | d | arrival | iforderpv_24h | decisionhabit_user | historyvisit_totalordernum | hotelcr | ordercanceledprecent | landhalfhours | ... | lowestprice_pre2 | lasthtlordergap | businessrate_pre2 | cityuvs | cityorders | lastpvgap | cr | sid | visitnum_oneyear | h |
|---|
| 0 | 0 | 24636 | 2016-05-18 | 2016-05-18 | 0 | NaN | NaN | 1.04 | NaN | 22.0 | ... | 615.0 | NaN | 0.29 | 12.880 | 3.147 | NaN | NaN | 7 | NaN | 12 |
|---|
| 1 | 1 | 24637 | 2016-05-18 | 2016-05-18 | 0 | NaN | NaN | 1.06 | NaN | 0.0 | ... | 513.0 | NaN | 0.53 | 17.933 | 4.913 | NaN | NaN | 33 | NaN | 14 |
|---|
| 2 | 0 | 24641 | 2016-05-18 | 2016-05-19 | 0 | NaN | NaN | 1.05 | NaN | 3.0 | ... | 382.0 | NaN | 0.60 | 3.993 | 0.760 | NaN | NaN | 10 | NaN | 19 |
|---|
| 3 | 0 | 24642 | 2016-05-18 | 2016-05-18 | 0 | NaN | NaN | 1.01 | NaN | 2.0 | ... | 203.0 | NaN | 0.18 | 3.220 | 0.660 | NaN | NaN | 8 | NaN | 16 |
|---|
| 4 | 1 | 24644 | 2016-05-18 | 2016-05-19 | 0 | NaN | NaN | 1.00 | NaN | 0.0 | ... | 84.0 | NaN | NaN | 0.013 | NaN | NaN | NaN | 1 | NaN | 21 |
|---|
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
|---|
| 689940 | 1 | 2238419 | 2016-05-15 | 2016-05-17 | 1 | 19.0 | NaN | 1.06 | NaN | 1.0 | ... | 406.0 | NaN | 0.48 | 13.573 | 1.660 | 1034.0 | 1.0 | 5 | 119.0 | 18 |
|---|
| 689941 | 1 | 2238421 | 2016-05-15 | 2016-05-15 | 1 | 10.0 | 3.0 | 1.06 | 0.33 | 49.0 | ... | 199.0 | 713.0 | 0.51 | 2.880 | 0.513 | 179.0 | 2.0 | 15 | 1472.0 | 12 |
|---|
| 689942 | 0 | 2238422 | 2016-05-15 | 2016-05-17 | 0 | NaN | NaN | 1.07 | NaN | 0.0 | ... | 544.0 | NaN | 0.45 | 15.293 | 2.067 | 0.0 | NaN | 8 | 107.0 | 0 |
|---|
| 689943 | 0 | 2238425 | 2016-05-15 | 2016-05-17 | 0 | NaN | NaN | 1.04 | NaN | 0.0 | ... | 156.0 | NaN | 0.29 | 2.467 | 0.333 | NaN | NaN | 4 | NaN | 0 |
|---|
| 689944 | 0 | 2238426 | 2016-05-15 | 2016-05-15 | 0 | NaN | NaN | 1.02 | NaN | 0.0 | ... | 275.0 | NaN | NaN | 12.600 | 2.653 | NaN | NaN | 2 | NaN | 11 |
|---|
689945 rows × 50 columns
# Outlier inspection via extended percentiles, transposed for readability
df.describe([0.01, 0.25, 0.5, 0.75, 0.99]).T
| count | mean | std | min | 1% | 25% | 50% | 75% | 99% | max |
|---|
| label | 689945.0 | 0.274452 | 0.446238 | 0.000 | 0.00000 | 0.000 | 0.000 | 1.000 | 1.000000e+00 | 1.000 |
|---|
| sampleid | 689945.0 | 628540.209625 | 414681.498697 | 24636.000 | 36205.88000 | 312320.000 | 599637.000 | 887460.000 | 2.226893e+06 | 2238426.000 |
|---|
| iforderpv_24h | 689945.0 | 0.193737 | 0.395226 | 0.000 | 0.00000 | 0.000 | 0.000 | 0.000 | 1.000000e+00 | 1.000 |
|---|
| decisionhabit_user | 385450.0 | 5.317048 | 38.524483 | 0.000 | 1.00000 | 2.000 | 3.000 | 5.000 | 2.700000e+01 | 3167.000 |
|---|
| historyvisit_totalordernum | 386525.0 | 11.710487 | 17.251429 | 1.000 | 1.00000 | 2.000 | 6.000 | 14.000 | 8.200000e+01 | 711.000 |
|---|
| hotelcr | 689148.0 | 1.060996 | 0.045264 | 1.000 | 1.00000 | 1.030 | 1.050 | 1.090 | 1.190000e+00 | 3.180 |
|---|
| ordercanceledprecent | 447831.0 | 0.342119 | 0.354210 | 0.000 | 0.00000 | 0.000 | 0.250 | 0.570 | 1.000000e+00 | 1.000 |
|---|
| landhalfhours | 661312.0 | 6.086366 | 12.413225 | 0.000 | 0.00000 | 0.000 | 0.000 | 4.000 | 4.800000e+01 | 49.000 |
|---|
| ordercanncelednum | 447831.0 | 154.179369 | 398.456986 | 0.000 | 0.00000 | 0.000 | 2.000 | 153.000 | 1.752000e+03 | 13475.000 |
|---|
| commentnums | 622029.0 | 1272.090888 | 2101.871601 | 0.000 | 1.00000 | 115.000 | 514.000 | 1670.000 | 8.796000e+03 | 34189.000 |
|---|
| starprefer | 464892.0 | 67.532304 | 19.175094 | 0.000 | 20.00000 | 53.300 | 69.400 | 80.300 | 1.000000e+02 | 100.000 |
|---|
| novoters | 672918.0 | 1706.247901 | 2811.690007 | 1.000 | 1.00000 | 157.000 | 692.000 | 2196.000 | 1.157600e+04 | 45455.000 |
|---|
| consuming_capacity | 463837.0 | 39.154140 | 23.240147 | 0.000 | 8.00000 | 22.000 | 33.000 | 51.000 | 1.000000e+02 | 100.000 |
|---|
| historyvisit_avghotelnum | 387876.0 | 6.510179 | 41.045261 | 0.000 | 1.00000 | 2.000 | 4.000 | 7.000 | 2.900000e+01 | 3167.000 |
|---|
| cancelrate | 678227.0 | 1051.604143 | 1509.066134 | 1.000 | 2.00000 | 137.000 | 503.000 | 1373.000 | 6.399000e+03 | 18930.000 |
|---|
| historyvisit_visit_detailpagenum | 307234.0 | 37.153603 | 73.402891 | 1.000 | 1.00000 | 6.000 | 18.000 | 44.000 | 2.620000e+02 | 6199.000 |
|---|
| delta_price1 | 437146.0 | 79.067012 | 512.942824 | -99879.000 | -1227.55000 | -31.000 | 81.000 | 226.000 | 1.081000e+03 | 5398.000 |
|---|
| price_sensitive | 463837.0 | 24.645863 | 26.685606 | 0.000 | 0.00000 | 5.000 | 16.000 | 33.000 | 1.000000e+02 | 100.000 |
|---|
| hoteluv | 689148.0 | 95.092708 | 169.981527 | 0.007 | 0.16700 | 10.427 | 36.180 | 107.747 | 9.641130e+02 | 1722.613 |
|---|
| businessrate_pre | 483896.0 | 0.372717 | 0.232791 | 0.000 | 0.01000 | 0.150 | 0.390 | 0.570 | 8.000000e-01 | 0.990 |
|---|
| ordernum_oneyear | 447831.0 | 11.642061 | 17.137209 | 1.000 | 1.00000 | 2.000 | 6.000 | 14.000 | 8.100000e+01 | 711.000 |
|---|
| cr_pre | 660548.0 | 1.062906 | 0.044588 | 1.000 | 1.00000 | 1.030 | 1.060 | 1.090 | 1.190000e+00 | 2.950 |
|---|
| avgprice | 457261.0 | 422.458701 | 290.853332 | 1.000 | 91.00000 | 232.000 | 350.000 | 524.000 | 1.491000e+03 | 6383.000 |
|---|
| lowestprice | 687931.0 | 318.806242 | 575.782415 | -3.000 | 37.00000 | 116.000 | 200.000 | 380.000 | 1.823000e+03 | 100000.000 |
|---|
| firstorder_bu | 376993.0 | 11.697795 | 2.746821 | 1.000 | 3.00000 | 12.000 | 13.000 | 13.000 | 1.700000e+01 | 21.000 |
|---|
| customereval_pre2 | 661312.0 | 3.048519 | 1.226635 | 0.000 | 0.00000 | 2.000 | 3.000 | 4.000 | 5.500000e+00 | 6.000 |
|---|
| delta_price2 | 437750.0 | 77.277208 | 391.413839 | -43344.000 | -949.00000 | -29.000 | 69.000 | 198.000 | 1.018000e+03 | 5114.000 |
|---|
| commentnums_pre | 598368.0 | 1415.159561 | 2329.418922 | 0.000 | 1.00000 | 137.000 | 592.000 | 1862.000 | 9.732000e+03 | 34189.000 |
|---|
| customer_value_profit | 439123.0 | 3.038409 | 6.625281 | -24.075 | -0.29678 | 0.269 | 0.991 | 3.138 | 2.845100e+01 | 598.064 |
|---|
| commentnums_pre2 | 648457.0 | 1313.388737 | 1719.513354 | 0.000 | 3.00000 | 270.000 | 768.000 | 1780.000 | 7.457000e+03 | 34189.000 |
|---|
| cancelrate_pre | 653015.0 | 0.344422 | 0.179147 | 0.000 | 0.05000 | 0.230 | 0.320 | 0.420 | 1.000000e+00 | 1.000 |
|---|
| novoters_pre2 | 657616.0 | 1787.197614 | 2316.712985 | 1.000 | 5.00000 | 391.000 | 1054.000 | 2413.000 | 1.001800e+04 | 45436.000 |
|---|
| novoters_pre | 648956.0 | 1890.698450 | 3116.120062 | 1.000 | 2.00000 | 187.000 | 783.000 | 2453.000 | 1.383900e+04 | 45436.000 |
|---|
| ctrip_profits | 445187.0 | 4.208495 | 9.314438 | -44.313 | -0.39300 | 0.340 | 1.347 | 4.320 | 4.075580e+01 | 600.820 |
|---|
| deltaprice_pre2_t1 | 543180.0 | 3.283740 | 48.805880 | -2296.000 | -103.00000 | -3.000 | 2.000 | 10.000 | 1.110000e+02 | 3324.000 |
|---|
| lowestprice_pre | 659689.0 | 315.954583 | 463.723643 | 1.000 | 38.00000 | 118.000 | 208.000 | 385.000 | 1.750000e+03 | 100000.000 |
|---|
| uv_pre | 660548.0 | 107.846076 | 186.731907 | 0.007 | 0.24000 | 12.533 | 42.500 | 124.707 | 1.047787e+03 | 1722.613 |
|---|
| uv_pre2 | 661189.0 | 103.352990 | 157.117863 | 0.007 | 0.50000 | 17.563 | 51.287 | 126.200 | 8.567254e+02 | 1722.613 |
|---|
| lowestprice_pre2 | 660664.0 | 318.541812 | 351.913035 | 1.000 | 52.00000 | 145.000 | 233.000 | 388.000 | 1.464000e+03 | 43700.000 |
|---|
| lasthtlordergap | 447831.0 | 101830.919400 | 122784.313864 | 0.000 | 244.00000 | 14999.000 | 46890.000 | 138953.000 | 4.847340e+05 | 527026.000 |
|---|
| businessrate_pre2 | 602960.0 | 0.368237 | 0.219945 | 0.000 | 0.01000 | 0.170 | 0.400 | 0.550 | 7.800000e-01 | 0.990 |
|---|
| cityuvs | 682274.0 | 10.648278 | 15.696682 | 0.007 | 0.01300 | 0.827 | 3.527 | 13.327 | 6.600700e+01 | 67.140 |
|---|
| cityorders | 651263.0 | 2.253250 | 3.538453 | 0.007 | 0.00700 | 0.127 | 0.627 | 2.747 | 1.445300e+01 | 14.507 |
|---|
| lastpvgap | 592818.0 | 12049.409382 | 25601.374138 | 0.000 | 0.00000 | 551.000 | 2848.000 | 10726.000 | 1.387220e+05 | 194386.000 |
|---|
| cr | 457896.0 | 1.137476 | 0.204789 | 1.000 | 1.00000 | 1.000 | 1.050 | 1.210 | 2.000000e+00 | 11.000 |
|---|
| sid | 689945.0 | 153.702414 | 277.807697 | 0.000 | 1.00000 | 17.000 | 62.000 | 180.000 | 1.212000e+03 | 9956.000 |
|---|
| visitnum_oneyear | 592910.0 | 18551.846682 | 228860.311117 | 1.000 | 21.00000 | 471.000 | 1315.000 | 3141.000 | 2.625670e+05 | 9651192.000 |
|---|
| h | 689945.0 | 14.462315 | 6.301575 | 0.000 | 0.00000 | 11.000 | 15.000 | 20.000 | 2.300000e+01 | 23.000 |
|---|
# Columns with anomalies: a negative price (min -3) and implausibly high prices (max 100000)
df[['lowestprice_pre', 'lowestprice']].describe([0.01, 0.25, 0.5, 0.75, 0.99]).T
| count | mean | std | min | 1% | 25% | 50% | 75% | 99% | max |
|---|
| lowestprice_pre | 659689.0 | 315.954583 | 463.723643 | 1.0 | 38.0 | 118.0 | 208.0 | 385.0 | 1750.0 | 100000.0 |
|---|
| lowestprice | 687931.0 | 318.806242 | 575.782415 | -3.0 | 37.0 | 116.0 | 200.0 | 380.0 | 1823.0 | 100000.0 |
|---|
# Columns to winsorize (cap at the 1st/99th percentiles)
col_block = ['lowestprice_pre', 'lowestprice']
# Winsorization helpers: cap extreme values at the 1st/99th percentiles
def block_upper(x):
    """Cap a numeric Series at its 99th percentile (upper winsorization).

    NaN values are left untouched; values above the cap are replaced by it.
    """
    cap = x.quantile(0.99)
    return x.clip(upper=cap)
def block_lower(x):
    """Floor a numeric Series at its 1st percentile (lower winsorization).

    NaN values are left untouched; values below the floor are replaced by it.
    """
    floor = x.quantile(0.01)
    return x.clip(lower=floor)
# Apply the caps column-wise. Note the lower 1% quantile is computed on the
# already upper-capped data, so the two passes are not strictly independent.
df[col_block] = df[col_block].apply(block_upper)
df[col_block] = df[col_block].apply(block_lower)
df[['lowestprice_pre', 'lowestprice']].describe([0.01, 0.25, 0.5, 0.75, 0.99]).T
| count | mean | std | min | 1% | 25% | 50% | 75% | 99% | max |
|---|
| lowestprice_pre | 659689.0 | 304.439507 | 287.192512 | 38.0 | 38.0 | 118.0 | 208.0 | 385.0 | 1750.0 | 1750.0 |
|---|
| lowestprice | 687931.0 | 305.025771 | 297.382838 | 37.0 | 37.0 | 116.0 | 200.0 | 380.0 | 1823.0 | 1823.0 |
|---|
# Deep copy so later edits do not mutate the cleaned frame
df_copy = df.copy(deep=True)
# Features: every column after label and sampleid
X = df_copy.iloc[:, 2:]
# Target column (label)
y = df_copy.iloc[:, 0]
X.head(10)
| d | arrival | iforderpv_24h | decisionhabit_user | historyvisit_totalordernum | hotelcr | ordercanceledprecent | landhalfhours | ordercanncelednum | commentnums | ... | lowestprice_pre2 | lasthtlordergap | businessrate_pre2 | cityuvs | cityorders | lastpvgap | cr | sid | visitnum_oneyear | h |
|---|
| 0 | 2016-05-18 | 2016-05-18 | 0 | NaN | NaN | 1.04 | NaN | 22.0 | NaN | 1089.0 | ... | 615.0 | NaN | 0.29 | 12.880 | 3.147 | NaN | NaN | 7 | NaN | 12 |
|---|
| 1 | 2016-05-18 | 2016-05-18 | 0 | NaN | NaN | 1.06 | NaN | 0.0 | NaN | 5612.0 | ... | 513.0 | NaN | 0.53 | 17.933 | 4.913 | NaN | NaN | 33 | NaN | 14 |
|---|
| 2 | 2016-05-18 | 2016-05-19 | 0 | NaN | NaN | 1.05 | NaN | 3.0 | NaN | 256.0 | ... | 382.0 | NaN | 0.60 | 3.993 | 0.760 | NaN | NaN | 10 | NaN | 19 |
|---|
| 3 | 2016-05-18 | 2016-05-18 | 0 | NaN | NaN | 1.01 | NaN | 2.0 | NaN | NaN | ... | 203.0 | NaN | 0.18 | 3.220 | 0.660 | NaN | NaN | 8 | NaN | 16 |
|---|
| 4 | 2016-05-18 | 2016-05-19 | 0 | NaN | NaN | 1.00 | NaN | 0.0 | NaN | NaN | ... | 84.0 | NaN | NaN | 0.013 | NaN | NaN | NaN | 1 | NaN | 21 |
|---|
| 5 | 2016-05-18 | 2016-05-20 | 0 | NaN | NaN | 1.02 | NaN | 0.0 | NaN | 15.0 | ... | 408.0 | NaN | NaN | 2.880 | 0.427 | NaN | NaN | 1 | NaN | 21 |
|---|
| 6 | 2016-05-18 | 2016-05-25 | 0 | NaN | NaN | 1.12 | NaN | 0.0 | NaN | 2578.0 | ... | 145.0 | NaN | NaN | 4.427 | 0.493 | NaN | NaN | 1 | NaN | 22 |
|---|
| 7 | 2016-05-18 | 2016-05-20 | 0 | 3.0 | 21.0 | 1.11 | 0.79 | 0.0 | 395.0 | NaN | ... | 204.0 | 10475.0 | 0.53 | 12.713 | 1.987 | 7566.0 | 1.5 | 23 | 1265.0 | 17 |
|---|
| 8 | 2016-05-18 | 2016-05-19 | 0 | 13.0 | NaN | 1.08 | NaN | 0.0 | NaN | 2572.0 | ... | 99.0 | NaN | 0.41 | 5.393 | 0.860 | 15.0 | 1.0 | 20 | 596.0 | 20 |
|---|
| 9 | 2016-05-18 | 2016-06-08 | 1 | 2.0 | 7.0 | 1.07 | 0.86 | 47.0 | 6.0 | NaN | ... | 191.0 | 18873.0 | 0.52 | 3.093 | 0.287 | 288.0 | 1.0 | 31 | 21926.0 | 7 |
|---|
10 rows × 48 columns
# Train/test split (default 75/25). NOTE(review): the label is imbalanced
# (~27% positive); stratify=y would preserve the class ratio in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Drop the raw date string columns from the training set only.
# NOTE(review): X_test keeps 'd'/'arrival' here; they are removed only later
# by the F-test column selection — confirm this asymmetry is intentional.
col_date = ['d', 'arrival']
X_train.drop(col_date, axis=1, inplace=True)
X_train.shape
(517458, 46)
# Group columns so each group gets its own imputation strategy
col = X_train.columns.tolist()
col_no = ['sid', 'iforderpv_24h', 'h'] # columns with no missing values (the two date features excluded)
col_clf = ['decisionhabit_user'] # categorical-like feature
col_neg = ['delta_price1', 'delta_price2', 'customer_value_profit', 'ctrip_profits', 'deltaprice_pre2_t1'] # features containing negative values
col_35 = ['ordernum_oneyear', 'lasthtlordergap', 'ordercanncelednum',
'ordercanceledprecent', 'ctrip_profits', 'historyvisit_avghotelnum', 'historyvisit_totalordernum', # features with >35% missing values
'decisionhabit_user', 'firstorder_bu', 'historyvisit_visit_detailpagenum']
col_std = X_train.columns[X_train.describe(include='all').T['std'] > 100].to_list() # NOTE(review): original comment said "variance > 100" but the code filters on std
col_std.remove('sid')
col_std.remove('delta_price2')
col_std.remove('delta_price1')
col_std.remove('lasthtlordergap')
col_norm = list(set(col) - set(col_no + col_clf + col_neg + col_35)) # everything else: fill with the mean
# Impute missing values, per column group.
# Fix: DataFrame.fillna(DataFrame.mode()) aligns on the ROW index, so only
# rows 0..k would ever be filled — take the first mode row explicitly.
# (Here the bug was masked because 'decisionhabit_user' also appears in
# col_35 and was later overwritten with -1.)
# NOTE(review): the test set is filled with its own statistics; to avoid
# train/test leakage it would be better to reuse the training-set
# mode/median/mean — left unchanged to preserve the recorded results.
X_train[col_clf] = X_train[col_clf].fillna(X_train[col_clf].mode().iloc[0])
X_train[col_neg] = X_train[col_neg].fillna(X_train[col_neg].median())
X_train[col_35] = X_train[col_35].fillna(-1)  # missing itself is informative: flag with -1
X_train[col_std] = X_train[col_std].fillna(X_train[col_std].median())  # heavy-tailed: median is robust
X_train[col_norm] = X_train[col_norm].fillna(X_train[col_norm].mean())
# Same strategy on the test set
X_test[col_clf] = X_test[col_clf].fillna(X_test[col_clf].mode().iloc[0])
X_test[col_neg] = X_test[col_neg].fillna(X_test[col_neg].median())
X_test[col_35] = X_test[col_35].fillna(-1)
X_test[col_std] = X_test[col_std].fillna(X_test[col_std].median())
X_test[col_norm] = X_test[col_norm].fillna(X_test[col_norm].mean())
# Verify no missing values remain in the training set (expect 0)
X_train.isnull().any().sum()
0
X_test.isnull().any().sum() # expect 0 for the test set as well
0
X_train.shape # 46 features after dropping the two date columns
(517458, 46)
X_test.shape # NOTE(review): still 48 columns — 'd'/'arrival' were never dropped from X_test
(172487, 48)
# Variance filter: drop zero-variance features (the shape shows none are removed)
selector = VarianceThreshold()
X_train_var = selector.fit_transform(X_train)
X_train_var.shape
(517458, 46)
# ANOVA F-test of each feature against the label;
# count how many features are NOT significant at the 1% level
f, p_values = f_classif(X_train, y_train)
(p_values > 0.01).sum()
6
# Keep only features significant at the 1% level (40 of 46 remain)
col_f = X_train.columns[p_values <= 0.01]
X_train = X_train[col_f]
X_train.shape
(517458, 40)
X_test = X_test[col_f] # same selection on the test set (this also drops 'd'/'arrival')
X_test.shape
(172487, 40)
# Reset indices so train/test frames align positionally with the labels
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)
# Fit a small random forest to obtain per-feature importances
rfc = RFC(n_estimators=10, random_state=42)
importances = rfc.fit(X_train, y_train).feature_importances_
importances
array([0.0116028 , 0.01896167, 0.01802851, 0.0150776 , 0.01591757,
0.02015549, 0.02034934, 0.0203898 , 0.01943307, 0.0211841 ,
0.01791408, 0.02056686, 0.02291317, 0.02291462, 0.02051174,
0.02323111, 0.02106971, 0.01054197, 0.02306534, 0.01938803,
0.0258305 , 0.02571515, 0.02508617, 0.02485002, 0.02534584,
0.02850795, 0.02408976, 0.02625239, 0.0279376 , 0.02743348,
0.02772216, 0.03058076, 0.02754623, 0.04068007, 0.03904556,
0.03655198, 0.03553142, 0.03861745, 0.04116244, 0.03829647])
# Embedded selection: sweep importance thresholds, cross-validate on the
# surviving features, and plot accuracy vs. threshold (learning curve).
scores = []
thresholds = np.linspace(0, importances.max(), 20)
for threshold in thresholds:
    start = time()
    # SelectFromModel refits rfc internally and keeps features whose
    # importance exceeds the threshold
    X_embedded = SelectFromModel(rfc, threshold=threshold).fit_transform(X_train, y_train)
    score = cross_val_score(rfc, X_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)
    # Fix: the original formatted elapsed seconds via
    # datetime.fromtimestamp(), which interprets a DURATION as an epoch
    # timestamp — timezone-dependent and wrong for runs >= 1 hour.
    # Report the elapsed wall-clock time directly instead.
    elapsed = time() - start
    print(f'{int(elapsed // 60):02d}:{elapsed % 60:09.6f}')
plt.plot(thresholds, scores)
plt.show()
01:12:090613
01:13:526636
01:09:249811
01:08:676264
01:07:224143
01:06:510684
01:14:200829
01:13:976302
01:13:173232
01:07:879761
01:05:081422
01:03:703478
00:57:668075
00:55:913328
00:49:275084
00:47:250707
00:48:596243
00:53:333874
00:45:020932
00:53:343730
# Best cross-validated score across the threshold sweep
max(scores)
0.9507844100831351
# Threshold that achieved the best score
thresholds[scores.index(max(scores))]
0.028163774952387383
# Keep features whose importance exceeds the best threshold found above.
# Fix: use the computed optimum instead of a hard-coded copy of its value,
# so the cell stays correct if the sweep above is re-run.
best_threshold = thresholds[scores.index(max(scores))]
col_k = X_train.columns[importances > best_threshold].to_list()
X_train_embedded = X_train[col_k]
X_train_embedded.head()
| ctrip_profits | lasthtlordergap | cityuvs | cityorders | lastpvgap | cr | sid | visitnum_oneyear | h |
|---|
| 0 | 1.347 | -1.0 | 3.787 | 0.387 | 2850.0 | 1.137405 | 3 | 1314.0 | 6 |
|---|
| 1 | 1.347 | -1.0 | 0.127 | 0.007 | 2850.0 | 1.137405 | 7 | 1314.0 | 13 |
|---|
| 2 | 0.767 | -1.0 | 18.973 | 3.600 | 7272.0 | 1.137405 | 457 | 348.0 | 12 |
|---|
| 3 | 15.433 | 1986.0 | 1.507 | 0.287 | 47.0 | 1.160000 | 430 | 20273.0 | 8 |
|---|
| 4 | 1.347 | -1.0 | 1.433 | 0.167 | 20.0 | 1.000000 | 85 | 83.0 | 13 |
|---|
X_test_embedded = X_test[col_k] # apply the same embedded selection to the test set
X_test_embedded.head()
| ctrip_profits | lasthtlordergap | cityuvs | cityorders | lastpvgap | cr | sid | visitnum_oneyear | h |
|---|
| 0 | 3.940 | 7224.0 | 7.147 | 0.580000 | 539.0 | 1.150000 | 220 | 4542.0 | 4 |
|---|
| 1 | 1.347 | 40911.0 | 0.447 | 0.060000 | 3.0 | 1.137687 | 81 | 3156.0 | 3 |
|---|
| 2 | 0.887 | -1.0 | 4.313 | 0.460000 | 6532.0 | 1.000000 | 81 | 1026.0 | 17 |
|---|
| 3 | 1.347 | -1.0 | 0.460 | 0.053000 | 363.0 | 1.000000 | 27 | 349.0 | 22 |
|---|
| 4 | 1.540 | 82256.0 | 0.060 | 2.246314 | 41.0 | 1.170000 | 63 | 811.0 | 10 |
|---|
# Correlation heatmap of the selected features
plt.figure(figsize=(10,8))
sns.heatmap(X_train_embedded.corr(), annot=True, linewidths=1)
# Drop one of each highly correlated pair ('cityuvs' vs 'cityorders').
# NOTE(review): inplace drop on a frame derived by column selection may
# raise SettingWithCopyWarning on some pandas versions.
X_train_embedded.drop('cityuvs', axis=1, inplace=True)
X_test_embedded.drop('cityuvs', axis=1, inplace=True)
# Persist the cleaned datasets (index is saved and restored on reload)
X_train_embedded.to_csv('X_train_embedded.csv')
X_test_embedded.to_csv('X_test_embedded.csv')
y_train.to_csv('y_train.csv')
y_test.to_csv('y_test.csv')
# Reload the cleaned data (index_col=0 restores the saved index)
X_train_embedded = pd.read_csv('X_train_embedded.csv', index_col=0)
X_test_embedded = pd.read_csv('X_test_embedded.csv', index_col=0)
y_train = pd.read_csv('y_train.csv', index_col=0)
y_test = pd.read_csv('y_test.csv', index_col=0)
y_train = np.ravel(y_train)  # flatten one-column frame to a 1-D array for sklearn
y_train.shape
y_test = np.ravel(y_test)
y_test.shape
(172487,)
# Feature glossary (from the dataset's data dictionary):
# 'lasthtlordergap': time since last order within the past year
# 'cityorders': yesterday's app orders for the same city and check-in date
# 'lastpvgap': time since last visit within the past year
# 'cr': user conversion rate
# 'sid': session id; sid == 1 can be treated as a new visitor
# 'visitnum_oneyear': visits per year
# 'h': hour of the visit
import scipy
# Re-attach the label to the selected training features for WoE binning
woe_data = pd.concat([X_train_embedded, pd.Series(y_train, name='label')], axis=1)
woe_data
| ctrip_profits | lasthtlordergap | cityorders | lastpvgap | cr | sid | visitnum_oneyear | h | label |
|---|
| 0 | 1.347 | -1.0 | 0.387 | 2850.0 | 1.137405 | 3 | 1314.0 | 6 | 0 |
|---|
| 1 | 1.347 | -1.0 | 0.007 | 2850.0 | 1.137405 | 7 | 1314.0 | 13 | 0 |
|---|
| 2 | 0.767 | -1.0 | 3.600 | 7272.0 | 1.137405 | 457 | 348.0 | 12 | 1 |
|---|
| 3 | 15.433 | 1986.0 | 0.287 | 47.0 | 1.160000 | 430 | 20273.0 | 8 | 1 |
|---|
| 4 | 1.347 | -1.0 | 0.167 | 20.0 | 1.000000 | 85 | 83.0 | 13 | 0 |
|---|
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
|---|
| 517453 | 1.347 | -1.0 | 0.113 | 4347.0 | 1.137405 | 8 | 278.0 | 21 | 0 |
|---|
| 517454 | 1.347 | 41045.0 | 0.520 | 2972.0 | 1.330000 | 25 | 1095.0 | 0 | 0 |
|---|
| 517455 | 1.347 | 113046.0 | 0.093 | 522.0 | 1.137405 | 120 | 6309.0 | 16 | 0 |
|---|
| 517456 | -0.067 | 266544.0 | 0.600 | 28378.0 | 1.000000 | 22 | 100.0 | 9 | 0 |
|---|
| 517457 | 1.347 | -1.0 | 0.420 | 2850.0 | 1.137405 | 5 | 1314.0 | 17 | 0 |
|---|
517458 rows × 9 columns
# Build a WoE (weight of evidence) table from binned counts
def get_woe(num_bins):
    """Build a WoE table from bins given as (min, max, count_0, count_1) tuples.

    Parameters
    ----------
    num_bins : iterable of tuples
        One (bin_min, bin_max, n_good, n_bad) tuple per bin, where label 0
        is "good" and label 1 is "bad".

    Returns
    -------
    pandas.DataFrame
        One row per bin with totals, rates, good%/bad% shares and
        woe = ln(good% / bad%). A bin with zero bads yields +inf WoE.
    """
    columns = ['min', 'max', 'count_0', 'count_1']
    # Fix: the original called pd.Dataframe (no such attribute) and raised
    # AttributeError on every call — the class is pd.DataFrame.
    df = pd.DataFrame(num_bins, columns=columns)
    df['total'] = df['count_0'] + df['count_1']
    df['percentage'] = df['total'] / df['total'].sum()
    df['bad_rate'] = df['count_1'] / df['total']
    df['good%'] = df['count_0'] / df['count_0'].sum()
    df['bad%'] = df['count_1'] / df['count_1'].sum()
    df['good-bad'] = df['good%'] - df['bad%']
    df['woe'] = np.log(df['good%'] / df['bad%'])
    return df
# Compute the Information Value from a get_woe() summary table.
def get_iv(df):
    """Return IV = sum over bins of (good% - bad%) * WOE."""
    return (df['good-bad'] * df['woe']).sum()
# Detailed WOE table for a single feature.
def get_bin(X, q):
    """Quantile-bin column `X` of the module-level `woe_data` into `q`
    bins and return the per-bin WOE summary produced by get_woe()."""
    data = woe_data.copy()
    # retbins=True also returns the bin edges
    data['qcut'], edges = pd.qcut(data[X], retbins=True, q=q, duplicates='drop')
    zeros = data[data['label'] == 0].groupby('qcut').count()['label']
    ones = data[data['label'] == 1].groupby('qcut').count()['label']
    return get_woe(list(zip(edges, edges[1:], zeros, ones)))
# Plot IV against the number of bins to compare binning choices.
def get_graph(X, n=2, q=20):
    """Chi-square bin merging for column `X` of the module-level `woe_data`.

    Starts from `q` equal-frequency bins, repeatedly merges the adjacent
    bin pair with the largest chi-square p-value (the least significantly
    different pair) until only `n` bins remain, recording the IV after
    each merge, then plots IV versus the number of bins.
    """
    df = woe_data.copy()
    # Initial equal-frequency binning; retbins gives the bin edges
    df['qcut'], updown = pd.qcut(df[X], retbins=True, q=q, duplicates='drop')
    count_0 = df[df['label']==0].groupby('qcut').count()['label']
    count_1 = df[df['label']==1].groupby('qcut').count()['label']
    # One (lower_edge, upper_edge, n_label0, n_label1) tuple per bin
    num_bins = [*zip(updown,updown[1:],count_0,count_1)]
    IV = []
    axisx = []
    while len(num_bins) > n:
        pvs = []
        # Chi-square p-value for every adjacent bin pair
        for i in range(len(num_bins)-1):
            x1 = num_bins[i][2:]
            x2 = num_bins[i+1][2:]
            pv = scipy.stats.chi2_contingency([x1,x2])[1]
            pvs.append(pv)
        # Merge the pair whose class distributions differ the least
        i = pvs.index(max(pvs))
        num_bins[i:i+2] = [(num_bins[i][0], num_bins[i+1][1],
                            num_bins[i][2] + num_bins[i+1][2],
                            num_bins[i][3] + num_bins[i+1][3])]
        woe_df = get_woe(num_bins)
        axisx.append(len(num_bins))
        IV.append(get_iv(woe_df))
    plt.figure()
    plt.plot(axisx, IV)
    plt.xticks(axisx)
    plt.xlabel("number of box")
    plt.ylabel("IV")
    plt.show()
# Draw the IV-vs-bin-count curve for every candidate WOE feature
col_woe = ['ctrip_profits', 'lasthtlordergap', 'cityorders',
           'lastpvgap', 'cr', 'sid', 'visitnum_oneyear', 'h']
for feature in col_woe:
    print(feature)
    get_graph(feature)
ctrip_profits
lasthtlordergap
cityorders
lastpvgap
cr
sid
visitnum_oneyear
h
# Churn is highest when time-since-last-order falls in the 2356-29219 band
get_bin('lasthtlordergap', 10)
| min | max | count_0 | count_1 | total | percentage | bad_rate | good% | bad% | good-bad | woe |
|---|
| 0 | -1.0 | 2356.0 | 158912 | 48086 | 206998 | 0.400029 | 0.232302 | 0.423198 | 0.338741 | 0.084457 | 0.222603 |
|---|
| 1 | 2356.0 | 13291.0 | 30983 | 20754 | 51737 | 0.099983 | 0.401144 | 0.082511 | 0.146201 | -0.063691 | -0.572057 |
|---|
| 2 | 13291.0 | 29219.0 | 34175 | 17574 | 51749 | 0.100006 | 0.339601 | 0.091011 | 0.123800 | -0.032789 | -0.307683 |
|---|
| 3 | 29219.0 | 56455.0 | 35511 | 16229 | 51740 | 0.099989 | 0.313664 | 0.094569 | 0.114325 | -0.019756 | -0.189714 |
|---|
| 4 | 56455.0 | 110984.0 | 37245 | 14499 | 51744 | 0.099997 | 0.280206 | 0.099187 | 0.102138 | -0.002951 | -0.029318 |
|---|
| 5 | 110984.0 | 232020.0 | 39177 | 12572 | 51749 | 0.100006 | 0.242942 | 0.104332 | 0.088563 | 0.015769 | 0.163861 |
|---|
| 6 | 232020.0 | 527026.0 | 39500 | 12241 | 51741 | 0.099991 | 0.236582 | 0.105192 | 0.086232 | 0.018961 | 0.198753 |
|---|
# Retention is strongest when the conversion rate is below 1.12
get_bin('cr',10)
| min | max | count_0 | count_1 | total | percentage | bad_rate | good% | bad% | good-bad | woe |
|---|
| 0 | 1.000000 | 1.120000 | 166695 | 42471 | 209166 | 0.404218 | 0.203049 | 0.443925 | 0.299186 | 0.144738 | 0.394588 |
|---|
| 1 | 1.120000 | 1.137405 | 135099 | 47248 | 182347 | 0.352390 | 0.259110 | 0.359781 | 0.332838 | 0.026944 | 0.077841 |
|---|
| 2 | 1.137405 | 1.170000 | 15752 | 7000 | 22752 | 0.043969 | 0.307665 | 0.041949 | 0.049311 | -0.007362 | -0.161699 |
|---|
| 3 | 1.170000 | 1.330000 | 37674 | 24461 | 62135 | 0.120077 | 0.393675 | 0.100329 | 0.172315 | -0.071986 | -0.540866 |
|---|
| 4 | 1.330000 | 11.000000 | 20283 | 20775 | 41058 | 0.079346 | 0.505992 | 0.054016 | 0.146349 | -0.092334 | -0.996724 |
|---|
# Retention gradually drops as city order volume grows,
# except for a clear rebound in the 1.4-2.25 band
get_bin('cityorders',10)
| min | max | count_0 | count_1 | total | percentage | bad_rate | good% | bad% | good-bad | woe |
|---|
| 0 | 0.007000 | 0.033000 | 44555 | 11516 | 56071 | 0.108359 | 0.205382 | 0.118654 | 0.081124 | 0.037530 | 0.380231 |
|---|
| 1 | 0.033000 | 0.093000 | 39744 | 10584 | 50328 | 0.097260 | 0.210300 | 0.105842 | 0.074559 | 0.031283 | 0.350359 |
|---|
| 2 | 0.093000 | 0.200000 | 40294 | 11095 | 51389 | 0.099310 | 0.215902 | 0.107307 | 0.078159 | 0.029148 | 0.316952 |
|---|
| 3 | 0.200000 | 0.380000 | 37656 | 11761 | 49417 | 0.095500 | 0.237995 | 0.100281 | 0.082850 | 0.017431 | 0.190947 |
|---|
| 4 | 0.380000 | 0.753000 | 37170 | 14675 | 51845 | 0.100192 | 0.283055 | 0.098987 | 0.103378 | -0.004391 | -0.043400 |
|---|
| 5 | 0.753000 | 1.400000 | 35492 | 16268 | 51760 | 0.100027 | 0.314297 | 0.094519 | 0.114600 | -0.020081 | -0.192649 |
|---|
| 6 | 1.400000 | 2.255565 | 49463 | 16530 | 65993 | 0.127533 | 0.250481 | 0.131725 | 0.116445 | 0.015279 | 0.123292 |
|---|
| 7 | 2.255565 | 3.260000 | 25531 | 11738 | 37269 | 0.072023 | 0.314953 | 0.067991 | 0.082688 | -0.014697 | -0.195694 |
|---|
| 8 | 3.260000 | 6.633000 | 32872 | 18901 | 51773 | 0.100053 | 0.365074 | 0.087541 | 0.133148 | -0.045607 | -0.419350 |
|---|
| 9 | 6.633000 | 14.507000 | 32726 | 18887 | 51613 | 0.099743 | 0.365935 | 0.087152 | 0.133049 | -0.045897 | -0.423060 |
|---|
# Users visiting after 7 p.m. churn less; daytime visitors churn more
get_bin('h',10)
| min | max | count_0 | count_1 | total | percentage | bad_rate | good% | bad% | good-bad | woe |
|---|
| 0 | 0.0 | 6.0 | 42287 | 15678 | 57965 | 0.112019 | 0.270474 | 0.112614 | 0.110443 | 0.002171 | 0.019465 |
|---|
| 1 | 6.0 | 10.0 | 46850 | 22957 | 69807 | 0.134904 | 0.328864 | 0.124766 | 0.161720 | -0.036954 | -0.259428 |
|---|
| 2 | 10.0 | 12.0 | 34400 | 16455 | 50855 | 0.098279 | 0.323567 | 0.091610 | 0.115917 | -0.024307 | -0.235329 |
|---|
| 3 | 12.0 | 13.0 | 19815 | 8752 | 28567 | 0.055206 | 0.306367 | 0.052769 | 0.061653 | -0.008884 | -0.155599 |
|---|
| 4 | 13.0 | 15.0 | 38660 | 18137 | 56797 | 0.109762 | 0.319330 | 0.102955 | 0.127766 | -0.024811 | -0.215905 |
|---|
| 5 | 15.0 | 17.0 | 42537 | 18293 | 60830 | 0.117555 | 0.300723 | 0.113280 | 0.128865 | -0.015585 | -0.128901 |
|---|
| 6 | 17.0 | 19.0 | 40305 | 16122 | 56427 | 0.109047 | 0.285714 | 0.107336 | 0.113571 | -0.006235 | -0.056466 |
|---|
| 7 | 19.0 | 21.0 | 49680 | 15527 | 65207 | 0.126014 | 0.238119 | 0.132303 | 0.109380 | 0.022923 | 0.190266 |
|---|
| 8 | 21.0 | 22.0 | 31405 | 6631 | 38036 | 0.073505 | 0.174335 | 0.083634 | 0.046712 | 0.036922 | 0.582455 |
|---|
| 9 | 22.0 | 23.0 | 29564 | 3403 | 32967 | 0.063710 | 0.103224 | 0.078732 | 0.023972 | 0.054759 | 1.189144 |
|---|
# Customer value is not monotone with churn; the 1.147-1.347 band
# has the lowest churn rate
get_bin('ctrip_profits',10)
| min | max | count_0 | count_1 | total | percentage | bad_rate | good% | bad% | good-bad | woe |
|---|
| 0 | -44.313 | 0.147 | 37921 | 13979 | 51900 | 0.100298 | 0.269345 | 0.100987 | 0.098475 | 0.002512 | 0.025192 |
|---|
| 1 | 0.147 | 0.500 | 37673 | 13923 | 51596 | 0.099711 | 0.269846 | 0.100327 | 0.098080 | 0.002246 | 0.022645 |
|---|
| 2 | 0.500 | 1.147 | 37713 | 14400 | 52113 | 0.100710 | 0.276323 | 0.100433 | 0.101441 | -0.001007 | -0.009980 |
|---|
| 3 | 1.147 | 1.347 | 150615 | 44404 | 195019 | 0.376879 | 0.227691 | 0.401102 | 0.312803 | 0.088299 | 0.248641 |
|---|
| 4 | 1.347 | 1.587 | 8296 | 3333 | 11629 | 0.022473 | 0.286611 | 0.022093 | 0.023479 | -0.001386 | -0.060856 |
|---|
| 5 | 1.587 | 3.220 | 36089 | 15701 | 51790 | 0.100085 | 0.303167 | 0.096108 | 0.110605 | -0.014497 | -0.140493 |
|---|
| 6 | 3.220 | 7.327 | 35310 | 16403 | 51713 | 0.099937 | 0.317193 | 0.094034 | 0.115551 | -0.021517 | -0.206054 |
|---|
| 7 | 7.327 | 600.820 | 31886 | 19812 | 51698 | 0.099908 | 0.383226 | 0.084915 | 0.139565 | -0.054650 | -0.496877 |
|---|
# lastpvgap, sid and visitnum_oneyear have very small IV values, so drop them
X_train_woe = X_train_embedded[['ctrip_profits', 'lasthtlordergap', 'cityorders', 'cr', 'h']]
X_test_woe = X_test_embedded[['ctrip_profits', 'lasthtlordergap', 'cityorders', 'cr', 'h']]
# Learning curve over max_depth
depths = np.arange(5, 21, 1)
scores = []
time0 = time()
for depth in depths:
    rfc = RFC(n_estimators=10, max_depth=depth, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)
# BUG fix: datetime.fromtimestamp() treats the elapsed seconds as a wall-clock
# epoch (the local timezone offset leaks in) and '%M:%S:%f' drops whole hours;
# timedelta formats a duration correctly.
print('花费时间:{}'.format(datetime.timedelta(seconds=time() - time0)))
print('最大分数为{},最大深度为{}'.format(max(scores), depths[np.argmax(scores)]))
plt.figure(figsize=(10, 8))
plt.plot(depths, scores)
plt.show()
花费时间:03:27:049065
最大分数为0.8891446300224614,最大深度为20
# Learning curve over min_samples_split
split_grid = np.arange(2, 10, 1)
scores = []
time0 = time()
for min_split in split_grid:
    rfc = RFC(n_estimators=10, max_depth=20, min_samples_split=min_split, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)
# BUG fix: datetime.fromtimestamp() misformats durations (timezone offset,
# hours dropped by '%M:%S:%f'); timedelta is the correct duration type.
print('花费时间:{}'.format(datetime.timedelta(seconds=time() - time0)))
print('最大分数为{},最小分割数为{}'.format(max(scores), split_grid[np.argmax(scores)]))
plt.figure(figsize=(10, 8))
plt.plot(split_grid, scores)
plt.show()
花费时间:02:16:546873
最大分数为0.8891446300224614,最小分割数为2
# Learning curve over min_samples_leaf
leaf_grid = np.arange(1, 10, 1)
scores = []
time0 = time()
for min_leaf in leaf_grid:
    rfc = RFC(n_estimators=10, max_depth=20, min_samples_split=2, min_samples_leaf=min_leaf, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=5, n_jobs=-1).mean()
    scores.append(score)
# BUG fix: datetime.fromtimestamp() misformats durations (timezone offset,
# hours dropped by '%M:%S:%f'); timedelta is the correct duration type.
print('花费时间:{}'.format(datetime.timedelta(seconds=time() - time0)))
print('最大分数为{},小叶子节点样本数{}'.format(max(scores), leaf_grid[np.argmax(scores)]))
plt.figure(figsize=(10, 8))
plt.plot(leaf_grid, scores)
plt.show()
花费时间:02:13:264602
最大分数为0.8891446300224614,小叶子节点样本数1
# Learning curve over the number of trees
tree_counts = np.arange(10, 201, 10)
scores = []
time0 = time()
for n_trees in tree_counts:
    rfc = RFC(n_estimators=n_trees, max_depth=20, random_state=42)
    score = cross_val_score(rfc, X_train_embedded, y_train, cv=3, n_jobs=-1).mean()
    scores.append(score)
# BUG fix: datetime.fromtimestamp() misformats durations -- the recorded
# output visibly wraps past one hour ('59:16' then '07:41') because
# '%M:%S:%f' drops hours; timedelta formats the full elapsed time.
print('花费时间:{}'.format(datetime.timedelta(seconds=time() - time0)))
print('最大分数为{},树数量{}'.format(max(scores), tree_counts[np.argmax(scores)]))
plt.figure(figsize=(10, 8))
plt.plot(tree_counts, scores)
plt.show()
花费时间:00:33:316156
花费时间:01:39:789986
花费时间:03:21:423289
花费时间:05:40:185622
花费时间:08:30:948403
花费时间:11:56:105026
花费时间:15:51:769696
花费时间:20:36:914026
花费时间:25:38:212320
花费时间:31:10:214501
花费时间:37:27:759795
花费时间:44:10:525391
花费时间:51:28:059915
花费时间:59:16:547367
花费时间:07:41:280818
花费时间:16:38:253700
花费时间:26:07:070954
花费时间:36:06:570602
花费时间:48:40:747174
花费时间:00:14:880475
最大分数为0.8988864801364768,树数量100
# Refit with the best hyper-parameters found above
rfc = RFC(n_estimators=100, max_depth=20, random_state=42).fit(X_train_embedded, y_train)
# Accuracy on the training split
print('训练集得分为{}'.format(rfc.score(X_train_embedded, y_train)))
# BUG fix: the test score must be computed on the test features --
# the original passed X_train_embedded (517458 rows) with y_test
# (172487 labels), which cannot even align.
print('测试集得分为{}'.format(rfc.score(X_test_embedded, y_test)))
训练集得分为0.9144162424776504
测试集得分为0.8858812548192038
# Inspect the fitted forest's feature importances
# (ordered like the columns of the training feature matrix)
rfc.feature_importances_
array([0.12193391, 0.12869867, 0.14163503, 0.13799971, 0.10331983,
0.12834216, 0.14547079, 0.09259991])
# Predicted probabilities on the test set
# BUG fix: the comment and the downstream roc_curve(y_test, ...) both
# require probabilities for the TEST rows, but the original called
# predict_proba on X_train_embedded.
y_scores = rfc.predict_proba(X_test_embedded)
y_scores
array([[0.42810513, 0.57189487],
[0.90260411, 0.09739589],
[0.87430575, 0.12569425],
...,
[0.3633656 , 0.6366344 ],
[0.79112542, 0.20887458],
[0.14388709, 0.85611291]])
# ROC / AUC on the held-out test set
from sklearn.metrics import roc_curve, auc
# column 1 of predict_proba is P(label == 1)
fpr, tpr, thresholds = roc_curve(y_test, y_scores[:, 1])
roc_auc = auc(fpr, tpr)
roc_auc
0.9680887508287116
# Plot the ROC curve.
def draw_roc(roc_auc, fpr, tpr):
    """Draw the ROC curve, with the AUC in the legend and the
    random-guess diagonal for reference."""
    plt.subplots(figsize=(7, 5.5))
    plt.plot(fpr, tpr, color='orange', label='roc curve(area={})'.format(roc_auc))
    plt.plot([0, 1], [0, 1], color='blue', linestyle='--')
    plt.xlim([0, 1])
    plt.ylim([0, 1.05])
    plt.xlabel('fpr')
    plt.ylabel('tpr')
    plt.title('ROC Curve')
    plt.legend(loc=4)
    plt.show()
draw_roc(roc_auc, fpr, tpr)
# Columns feeding the RFM model: yearly order count (F),
# average price (M) and time since last order (R)
rfm = df.loc[:, ['sampleid', 'ordernum_oneyear', 'avgprice', 'lasthtlordergap']]
rfm.head()
| sampleid | ordernum_oneyear | avgprice | lasthtlordergap |
|---|
| 0 | 24636 | NaN | NaN | NaN |
|---|
| 1 | 24637 | NaN | NaN | NaN |
|---|
| 2 | 24641 | NaN | NaN | NaN |
|---|
| 3 | 24642 | NaN | NaN | NaN |
|---|
| 4 | 24644 | NaN | NaN | NaN |
|---|
# Drop incomplete rows and rename to the conventional R/F/M column names
rfm = (rfm.dropna()
          .reset_index(drop=True)
          .rename(columns={'ordernum_oneyear': 'F', 'avgprice': 'M', 'lasthtlordergap': 'R'}))
rfm.head()
| sampleid | F | M | R |
|---|
| 0 | 24650 | 21.0 | 363.0 | 10475.0 |
|---|
| 1 | 24653 | 7.0 | 307.0 | 18873.0 |
|---|
| 2 | 24655 | 1.0 | 343.0 | 32071.0 |
|---|
| 3 | 24658 | 33.0 | 1000.0 | 4616.0 |
|---|
| 4 | 24662 | 4.0 | 685.0 | 44830.0 |
|---|
# R turns out to be recorded in minutes; convert to whole days (1440 min/day)
rfm['R'] = (rfm['R'] / 1440).round(0)
rfm.head()
| sampleid | F | M | R |
|---|
| 0 | 24650 | 21.0 | 363.0 | 7.0 |
|---|
| 1 | 24653 | 7.0 | 307.0 | 13.0 |
|---|
| 2 | 24655 | 1.0 | 343.0 | 22.0 |
|---|
| 3 | 24658 | 33.0 | 1000.0 | 3.0 |
|---|
| 4 | 24662 | 4.0 | 685.0 | 31.0 |
|---|
# Summary statistics of R/F/M (transposed for readability)
rfm.describe().T
| count | mean | std | min | 25% | 50% | 75% | max |
|---|
| sampleid | 426425.0 | 629380.138599 | 414760.183032 | 24650.0 | 313549.0 | 600907.0 | 887813.0 | 2238403.0 |
|---|
| F | 426425.0 | 12.137916 | 17.405419 | 1.0 | 3.0 | 6.0 | 14.0 | 711.0 |
|---|
| M | 426425.0 | 421.604962 | 286.987700 | 1.0 | 233.0 | 351.0 | 523.0 | 6383.0 |
|---|
| R | 426425.0 | 70.742163 | 84.844780 | 0.0 | 11.0 | 33.0 | 97.0 | 366.0 |
|---|
# Hand-picked band edges based on the distributions above
f_bins = [-1, 3, 5, 7, 10, 720]
m_bins = [-1, 200, 400, 600, 800, 7000]
r_bins = [-1, 3, 7, 30, 180, 370]
# Recency: a smaller gap is better, so its labels run 5 -> 1;
# frequency and monetary labels run 1 -> 5
rfm['R_score'] = pd.cut(rfm['R'], bins=r_bins, labels=[5,4,3,2,1]).astype('int')
rfm['F_score'] = pd.cut(rfm['F'], bins=f_bins, labels=[1,2,3,4,5]).astype('int')
rfm['M_score'] = pd.cut(rfm['M'], bins=m_bins, labels=[1,2,3,4,5]).astype('int')
rfm
| sampleid | F | M | R | R_score | F_score | M_score |
|---|
| 0 | 24650 | 21.0 | 363.0 | 7.0 | 4 | 5 | 2 |
|---|
| 1 | 24653 | 7.0 | 307.0 | 13.0 | 3 | 3 | 2 |
|---|
| 2 | 24655 | 1.0 | 343.0 | 22.0 | 3 | 1 | 2 |
|---|
| 3 | 24658 | 33.0 | 1000.0 | 3.0 | 5 | 5 | 5 |
|---|
| 4 | 24662 | 4.0 | 685.0 | 31.0 | 2 | 2 | 4 |
|---|
| ... | ... | ... | ... | ... | ... | ... | ... |
|---|
| 426420 | 2238388 | 2.0 | 226.0 | 119.0 | 2 | 1 | 2 |
|---|
| 426421 | 2238389 | 4.0 | 461.0 | 0.0 | 5 | 2 | 3 |
|---|
| 426422 | 2238396 | 5.0 | 193.0 | 44.0 | 2 | 2 | 1 |
|---|
| 426423 | 2238397 | 1.0 | 258.0 | 87.0 | 2 | 1 | 2 |
|---|
| 426424 | 2238403 | 3.0 | 256.0 | 52.0 | 2 | 1 | 2 |
|---|
426425 rows × 7 columns
# Score above the column mean -> 1, otherwise 0
for metric in ('R', 'F', 'M'):
    score_col = metric + '_score'
    rfm[metric + '_level'] = (rfm[score_col] > rfm[score_col].mean()) * 1
rfm
| sampleid | F | M | R | R_score | F_score | M_score | R_level | F_level | M_level |
|---|
| 0 | 24650 | 21.0 | 363.0 | 7.0 | 4 | 5 | 2 | 1 | 1 | 0 |
|---|
| 1 | 24653 | 7.0 | 307.0 | 13.0 | 3 | 3 | 2 | 1 | 1 | 0 |
|---|
| 2 | 24655 | 1.0 | 343.0 | 22.0 | 3 | 1 | 2 | 1 | 0 | 0 |
|---|
| 3 | 24658 | 33.0 | 1000.0 | 3.0 | 5 | 5 | 5 | 1 | 1 | 1 |
|---|
| 4 | 24662 | 4.0 | 685.0 | 31.0 | 2 | 2 | 4 | 0 | 0 | 1 |
|---|
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
|---|
| 426420 | 2238388 | 2.0 | 226.0 | 119.0 | 2 | 1 | 2 | 0 | 0 | 0 |
|---|
| 426421 | 2238389 | 4.0 | 461.0 | 0.0 | 5 | 2 | 3 | 1 | 0 | 1 |
|---|
| 426422 | 2238396 | 5.0 | 193.0 | 44.0 | 2 | 2 | 1 | 0 | 0 | 0 |
|---|
| 426423 | 2238397 | 1.0 | 258.0 | 87.0 | 2 | 1 | 2 | 0 | 0 | 0 |
|---|
| 426424 | 2238403 | 3.0 | 256.0 | 52.0 | 2 | 1 | 2 | 0 | 0 | 0 |
|---|
426425 rows × 10 columns
# Combine the three level flags into an RFM code and map it to a segment name.
# FIX: the original wrapped the concatenated Series in pd.concat([...]) --
# a no-op around a single object; assign the Series directly.
rfm['RFM'] = rfm['R_level'].astype('str') + rfm['F_level'].astype('str') + rfm['M_level'].astype('str')
rfm['RFM'].replace(
    ['111', '101', '011', '001', '110', '100', '010', '000'],
    ['重要价值用户', '重要发展用户', '重要保持用户', '重要挽留用户',
     '一般价值用户', '一般发展用户', '一般保持用户', '一般挽留用户'],
    inplace=True)
rfm
| sampleid | F | M | R | R_score | F_score | M_score | R_level | F_level | M_level | RFM |
|---|
| 0 | 24650 | 21.0 | 363.0 | 7.0 | 4 | 5 | 2 | 1 | 1 | 0 | 一般价值用户 |
|---|
| 1 | 24653 | 7.0 | 307.0 | 13.0 | 3 | 3 | 2 | 1 | 1 | 0 | 一般价值用户 |
|---|
| 2 | 24655 | 1.0 | 343.0 | 22.0 | 3 | 1 | 2 | 1 | 0 | 0 | 一般发展用户 |
|---|
| 3 | 24658 | 33.0 | 1000.0 | 3.0 | 5 | 5 | 5 | 1 | 1 | 1 | 重要价值用户 |
|---|
| 4 | 24662 | 4.0 | 685.0 | 31.0 | 2 | 2 | 4 | 0 | 0 | 1 | 重要挽留用户 |
|---|
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
|---|
| 426420 | 2238388 | 2.0 | 226.0 | 119.0 | 2 | 1 | 2 | 0 | 0 | 0 | 一般挽留用户 |
|---|
| 426421 | 2238389 | 4.0 | 461.0 | 0.0 | 5 | 2 | 3 | 1 | 0 | 1 | 重要发展用户 |
|---|
| 426422 | 2238396 | 5.0 | 193.0 | 44.0 | 2 | 2 | 1 | 0 | 0 | 0 | 一般挽留用户 |
|---|
| 426423 | 2238397 | 1.0 | 258.0 | 87.0 | 2 | 1 | 2 | 0 | 0 | 0 | 一般挽留用户 |
|---|
| 426424 | 2238403 | 3.0 | 256.0 | 52.0 | 2 | 1 | 2 | 0 | 0 | 0 | 一般挽留用户 |
|---|
426425 rows × 11 columns
# Count users per RFM segment
# BUG fix: `pd.Dataframe` does not exist (AttributeError); moreover the
# wrapper is redundant -- groupby(as_index=False).agg already returns a
# DataFrame.
rfm_new = rfm.groupby('RFM', as_index=False)['sampleid'].agg('count')
rfm_new
| RFM | sampleid |
|---|
| 0 | 一般价值用户 | 78592 |
|---|
| 1 | 一般保持用户 | 46850 |
|---|
| 2 | 一般发展用户 | 42275 |
|---|
| 3 | 一般挽留用户 | 83394 |
|---|
| 4 | 重要价值用户 | 63595 |
|---|
| 5 | 重要保持用户 | 38850 |
|---|
| 6 | 重要发展用户 | 20235 |
|---|
| 7 | 重要挽留用户 | 52634 |
|---|
# Pie chart of the segment shares
plt.figure(figsize=(12, 6))
segment_shares = (rfm_new['sampleid'] / rfm_new['sampleid'].sum()).to_list()
plt.pie(segment_shares, labels=rfm_new['RFM'].to_list(), autopct='%0.2f%%')
[Text(0.9207056795449674, 0.6019144886556893, '一般价值用户'),
Text(0.07432600254562635, 1.0974860570164833, '一般保持用户'),
Text(-0.6110720650087508, 0.9146534487804336, '一般发展用户'),
Text(-1.0982775444537256, 0.061534017816936265, '一般挽留用户'),
Text(-0.5691837587192285, -0.9412915854347425, '重要价值用户'),
Text(0.23025758705583027, -1.0756307189752563, '重要保持用户'),
Text(0.6623554620643879, -0.8782284679247601, '重要发展用户'),
Text(1.0183302486279413, -0.4159368999371846, '重要挽留用户')],
[Text(0.5022030979336185, 0.3283169938121941, '18.43%'),
Text(0.04054145593397801, 0.5986287583726272, '10.99%'),
Text(-0.33331203545931853, 0.49890188115296374, '9.91%'),
Text(-0.5990604787929412, 0.03356400971832887, '19.56%'),
Text(-0.31046386839230644, -0.5134317738734958, '14.91%'),
Text(0.1255950474849983, -0.5867076648955943, '9.11%'),
Text(0.3612847974896661, -0.47903370977714177, '4.75%'),
Text(0.5554528628879679, -0.22687467269300973, '12.34%')])