Swiggy Data Science Assessment - LinkedIn, MTV Get a Job
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
Import Dataset and Quick Inspection
dataset_path = './data/SampleAssessment.csv'
df = pd.read_csv(dataset_path)
df.head()
df.tail()
df['first_time'] = pd.to_datetime(df['first_time'])
df['recent_time'] = pd.to_datetime(df['recent_time'])
df.head()
Number of missing values (orders in 7 days and last 4 weeks)
df.isna().sum()
customer_id 0
first_time 0
recent_time 0
no_of_orders 0
orders_7_days 8077
orders_last_4_weeks 5659
amount 0
amt_last_7_days 0
amt_in_last_4_weeks 0
avg_dist_rest 0
avg_del_time 0
dtype: int64
Verifying whether the amount is zero for no orders
(df['amt_last_7_days'] == 0).sum(axis=0)
8078
(df['amt_in_last_4_weeks'] == 0).sum(axis=0)
5660
Description of the subset of dataset with negative restaurant distances
df[df['avg_dist_rest'] < 0].describe().round(2)
Description of the original dataset with negative restaurant distances
df.describe().round(2)
Description of the dataset without negative restaurant distances
df[df['avg_dist_rest'] > 0].describe().round(2)
- The mean and standard deviation are not much affected by the removal of negative disatances
- The above dataset will be use for further evaluations
- The rows with negative values are thereby discarded
df = df[df['avg_dist_rest'] > 0]
df.describe().round(2)
Correlation
df.corr()
- It appears that there are no other important correlations other than order vs amount
plt.figure(figsize=(20,10))
plt.title('Number of orders vs Amount spent', fontsize=26)
plt.xlabel('Number of Orders', fontsize=24)
plt.ylabel('Amount Spent', fontsize=24)
plt.scatter(df['no_of_orders'], df['amount'])
plt.savefig('./plots/orders_vs_amount.png', format='png', dpi=1000)
plt.show()
plt.figure(figsize=(20,10))
plt.title('Distribution of amount', fontsize=26)
plt.ylabel('Amount Spent', fontsize=24)
plt.hist(df['amount'], bins=[0, 10000, 20000, 30000, 40000, 50000, 60000, 70000])
plt.savefig('./plots/distribution_amount.png', format='png', dpi=300)
plt.show()
plt.figure(figsize=(20,10))
plt.title('Distribution of distances', fontsize=26)
plt.ylabel('Average Distances', fontsize=24)
plt.hist(df['avg_dist_rest'])
plt.savefig('./plots/distribution_distances.png', format='png', dpi=300)
plt.show()
plt.figure(figsize=(20,10))
plt.title('Distribution of delivery time', fontsize=26)
plt.ylabel('Average Delivery Time', fontsize=24)
plt.hist(df['avg_del_time'])
plt.savefig('./plots/distribution_delivery.png', format='png', dpi=300)
plt.show()
df.head()
Calculating customer Recency
latest_recent_time = df['recent_time'].max()
df['recency'] = df['recent_time'].apply(lambda x: (latest_recent_time - x).days)
Delivery score
- A custom score given to every customer
df['delivery_score'] = df['avg_del_time'] / df['avg_dist_rest']
df.head()
quantiles = df.quantile(q=[0.25,0.5,0.75])
quantiles.to_dict()
{'customer_id': {0.25: 335730.75, 0.5: 667378.0, 0.75: 1004865.75},
'no_of_orders': {0.25: 1.0, 0.5: 2.0, 0.75: 7.0},
'orders_7_days': {0.25: 1.0, 0.5: 1.0, 0.75: 2.0},
'orders_last_4_weeks': {0.25: 1.0, 0.5: 2.0, 0.75: 4.0},
'amount': {0.25: 280.0, 0.5: 691.0, 0.75: 2060.75},
'amt_last_7_days': {0.25: 0.0, 0.5: 0.0, 0.75: 0.0},
'amt_in_last_4_weeks': {0.25: 0.0, 0.5: 0.0, 0.75: 401.75},
'avg_dist_rest': {0.25: 1.7, 0.5: 2.4, 0.75: 3.1},
'avg_del_time': {0.25: 26.0, 0.5: 37.0, 0.75: 47.0},
'recency': {0.25: 39.0, 0.5: 65.0, 0.75: 111.0},
'delivery_score': {0.25: 10.74074074074074,
0.5: 15.65217391304348,
0.75: 22.5}}
# Arguments (x = value, p = recency, monetary_value, frequency, d = quartiles dict)
def RScore(x, p, d):
if x <= d[p][0.25]:
return 4
elif x <= d[p][0.50]:
return 3
elif x <= d[p][0.75]:
return 2
else:
return 1
# Arguments (x = value, p = recency, monetary_value, frequency, d = quartiles dict)
def FMScore(x, p, d):
if x <= d[p][0.25]:
return 1
elif x <= d[p][0.50]:
return 2
elif x <= d[p][0.75]:
return 3
else:
return 4
RFM Segemtation for last 7 days of customer orders
rfm_segmentation_7_days = df
rfm_segmentation_7_days['r_quartile_7_days'] = rfm_segmentation_7_days['recency'].apply(RScore, args=('recency',quantiles,))
rfm_segmentation_7_days['f_quartile_7_days'] = rfm_segmentation_7_days['orders_7_days'].apply(FMScore, args=('orders_7_days',quantiles,))
rfm_segmentation_7_days['m_quartile_7_days'] = rfm_segmentation_7_days['amt_last_7_days'].apply(FMScore, args=('amt_last_7_days',quantiles,))
rfm_segmentation_7_days.head()
RFM Segemtation for last 4 weeks of customer orders
rfm_segmentation_4_weeks = df
rfm_segmentation_4_weeks['r_quartile_4_weeks'] = rfm_segmentation_4_weeks['recency'].apply(RScore, args=('recency',quantiles,))
rfm_segmentation_4_weeks['f_quartile_4_weeks'] = rfm_segmentation_4_weeks['orders_last_4_weeks'].apply(FMScore, args=('orders_last_4_weeks',quantiles,))
rfm_segmentation_4_weeks['m_quartile_4_weeks'] = rfm_segmentation_4_weeks['amt_in_last_4_weeks'].apply(FMScore, args=('amt_in_last_4_weeks',quantiles,))
rfm_segmentation_4_weeks.head()
RFMScore = (r * f * m) * delivery_score
rfm_segmentation_7_days['RFMScore_7_days'] = rfm_segmentation_7_days.r_quartile_7_days \
* rfm_segmentation_7_days.f_quartile_7_days \
* rfm_segmentation_7_days.m_quartile_7_days \
* rfm_segmentation_7_days.delivery_score
rfm_segmentation_7_days.head()
rfm_segmentation_7_days.describe()
RFMScore = (r * f * m) * delivery_score
rfm_segmentation_4_weeks['RFMScore_4_weeks'] = rfm_segmentation_4_weeks.r_quartile_4_weeks \
* rfm_segmentation_4_weeks.f_quartile_4_weeks \
* rfm_segmentation_4_weeks.m_quartile_4_weeks \
* rfm_segmentation_4_weeks.delivery_score
rfm_segmentation_4_weeks.head()
rfm_segmentation_7_days.describe()['RFMScore_7_days']
count 9934.000000
mean 256.353922
std 395.715899
min 14.117647
25% 83.478261
50% 152.727273
75% 271.562998
max 11093.333333
Name: RFMScore_7_days, dtype: float64
rfm_segmentation_4_weeks.describe()['RFMScore_4_weeks']
count 9934.000000
mean 321.186736
std 500.580048
min 14.117647
25% 82.666667
50% 157.241379
75% 355.555556
max 12480.000000
Name: RFMScore_4_weeks, dtype: float64
Moving on to K Means Clustering
rfm_segmentation_7_days_values = rfm_segmentation_7_days[['amt_last_7_days', 'RFMScore_7_days']].values
rfm_segmentation_4_weeks_values = rfm_segmentation_4_weeks[['amt_in_last_4_weeks', 'RFMScore_4_weeks']].values
wcss = []
for i in range(1, 11):
kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(rfm_segmentation_7_days_values)
wcss.append(kmeans.inertia_)
plt.figure(figsize=(20,10))
plt.plot(range(1, 11), wcss)
plt.title('Finding the number of clusters - Elbow curve - last 7 days', fontsize=26)
plt.xlabel('Number of clusters', fontsize=24)
plt.ylabel('WCSS')
plt.savefig('./plots/7_days_elbow.png', format='png', dpi=300)
plt.show()
The curve says that 2 would be the ideal number of clusters
wcss = []
for i in range(1, 11):
kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(rfm_segmentation_4_weeks_values)
wcss.append(kmeans.inertia_)
plt.figure(figsize=(20,10))
plt.plot(range(1, 11), wcss)
plt.title('Finding the number of clusters - Elbow curve - last 4 weeks', fontsize=26)
plt.xlabel('Number of clusters', fontsize=24)
plt.ylabel('WCSS')
plt.savefig('./plots/4_weeks_elbow.png', format='png', dpi=300)
plt.show()
The curve says that 3 would be the ideal number of clusters
Applying K Means to customer data of past 7 days
kmeans = KMeans(n_clusters=2, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(rfm_segmentation_7_days_values)
# Visualizing the clusters
plt.figure(figsize=(20,10))
plt.scatter(rfm_segmentation_7_days_values[y_kmeans==0, 0], rfm_segmentation_7_days_values[y_kmeans==0, 1], s=100, c='red', label='Cluster 1', alpha=0.5)
plt.scatter(rfm_segmentation_7_days_values[y_kmeans==1, 0], rfm_segmentation_7_days_values[y_kmeans==1, 1], s=100, c='blue', label='Cluster 2', alpha=0.5)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of customers for last 7 days', fontsize=26)
plt.xlabel('Amount', fontsize=24)
plt.ylabel('RFMScore', fontsize=24)
plt.legend(prop={'size': 30})
plt.savefig('./plots/k_means_7_days.png', format='png', dpi=300)
plt.show()
Applying K Means to customer data of past 4 weeks
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(rfm_segmentation_4_weeks_values)
# Visualizing the clusters
plt.figure(figsize=(20,10))
plt.scatter(rfm_segmentation_4_weeks_values[y_kmeans==0, 0], rfm_segmentation_4_weeks_values[y_kmeans==0, 1], s=100, c='red', label='Cluster 1', alpha=0.5)
plt.scatter(rfm_segmentation_4_weeks_values[y_kmeans==1, 0], rfm_segmentation_4_weeks_values[y_kmeans==1, 1], s=100, c='blue', label='Cluster 2', alpha=0.5)
plt.scatter(rfm_segmentation_4_weeks_values[y_kmeans==2, 0], rfm_segmentation_4_weeks_values[y_kmeans==2, 1], s=100, c='green', label='Cluster 3', alpha=0.5)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of customers for last 4 weeks', fontsize=26)
plt.xlabel('Amount', fontsize=24)
plt.ylabel('RFMScore', fontsize=24)
plt.legend(prop={'size': 30})
plt.savefig('./plots/k_means_4_weeks.png', format='png', dpi=300)
plt.show()
Conclusion
- In a dataset of 10000 rows
- Recent orders (Last 7 days) can be categorized into two parts. One being least spending customers and the others are returning customers
- Past orders (Last 4 weeks) can be categorized into three parts. One being the least spending, another being loyal customers and the other being returning customers
- The customers in least spending customers (Red Cluster) have more orders but less amount spent
- The customers who are loyal (Green Cluster) are more like to spend money along with more orders
- The customers who are returning (Blue Cluster) are more likely to spend money eventhough their order quantity is less