Analyzing Data For A Food Tech Startup - Swiggy

by Sudhanva Narayana 4 years ago 5 min read

Swiggy Data Science Assessment - LinkedIn, MTV Get a Job

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

Import Dataset and Quick Inspection

dataset_path = './data/SampleAssessment.csv'
df = pd.read_csv(dataset_path)

df.head()

df.tail()

df['first_time'] = pd.to_datetime(df['first_time'])
df['recent_time'] = pd.to_datetime(df['recent_time'])

df.head()

Number of missing values (orders in 7 days and last 4 weeks)

df.isna().sum()

customer_id               0
first_time                0
recent_time               0
no_of_orders              0
orders_7_days          8077
orders_last_4_weeks    5659
amount                    0
amt_last_7_days           0
amt_in_last_4_weeks       0
avg_dist_rest             0
avg_del_time              0
dtype: int64

Verifying whether the amount is zero for no orders

(df['amt_last_7_days'] == 0).sum(axis=0)

(df['amt_in_last_4_weeks'] == 0).sum(axis=0)

Description of the subset of dataset with negative restaurant distances

df[df['avg_dist_rest'] < 0].describe().round(2)

Description of the original dataset with negative restaurant distances

df.describe().round(2)

Description of the dataset without negative restaurant distances

df[df['avg_dist_rest'] > 0].describe().round(2)

The mean and standard deviation are not much affected by the removal of negative disatances
The above dataset will be use for further evaluations
The rows with negative values are thereby discarded

df = df[df['avg_dist_rest'] > 0]
df.describe().round(2)

Correlation

df.corr()

It appears that there are no other important correlations other than order vs amount

plt.figure(figsize=(20,10))
plt.title('Number of orders vs Amount spent', fontsize=26)
plt.xlabel('Number of Orders', fontsize=24)
plt.ylabel('Amount Spent', fontsize=24)
plt.scatter(df['no_of_orders'], df['amount'])
plt.savefig('./plots/orders_vs_amount.png', format='png', dpi=1000)
plt.show()

plt.figure(figsize=(20,10))
plt.title('Distribution of amount', fontsize=26)
plt.ylabel('Amount Spent', fontsize=24)
plt.hist(df['amount'], bins=[0, 10000, 20000, 30000, 40000, 50000, 60000, 70000])
plt.savefig('./plots/distribution_amount.png', format='png', dpi=300)
plt.show()

plt.figure(figsize=(20,10))
plt.title('Distribution of distances', fontsize=26)
plt.ylabel('Average Distances', fontsize=24)
plt.hist(df['avg_dist_rest'])
plt.savefig('./plots/distribution_distances.png', format='png', dpi=300)
plt.show()

plt.figure(figsize=(20,10))
plt.title('Distribution of delivery time', fontsize=26)
plt.ylabel('Average Delivery Time', fontsize=24)
plt.hist(df['avg_del_time'])
plt.savefig('./plots/distribution_delivery.png', format='png', dpi=300)
plt.show()

df.head()

Calculating customer Recency

latest_recent_time = df['recent_time'].max()
df['recency'] = df['recent_time'].apply(lambda x: (latest_recent_time - x).days)

Delivery score

A custom score given to every customer

df['delivery_score'] = df['avg_del_time'] / df['avg_dist_rest']

df.head()

quantiles = df.quantile(q=[0.25,0.5,0.75])
quantiles.to_dict()

{'customer_id': {0.25: 335730.75, 0.5: 667378.0, 0.75: 1004865.75},
 'no_of_orders': {0.25: 1.0, 0.5: 2.0, 0.75: 7.0},
 'orders_7_days': {0.25: 1.0, 0.5: 1.0, 0.75: 2.0},
 'orders_last_4_weeks': {0.25: 1.0, 0.5: 2.0, 0.75: 4.0},
 'amount': {0.25: 280.0, 0.5: 691.0, 0.75: 2060.75},
 'amt_last_7_days': {0.25: 0.0, 0.5: 0.0, 0.75: 0.0},
 'amt_in_last_4_weeks': {0.25: 0.0, 0.5: 0.0, 0.75: 401.75},
 'avg_dist_rest': {0.25: 1.7, 0.5: 2.4, 0.75: 3.1},
 'avg_del_time': {0.25: 26.0, 0.5: 37.0, 0.75: 47.0},
 'recency': {0.25: 39.0, 0.5: 65.0, 0.75: 111.0},
 'delivery_score': {0.25: 10.74074074074074,
  0.5: 15.65217391304348,
  0.75: 22.5}}

# Arguments (x = value, p = recency, monetary_value, frequency, d = quartiles dict)

def RScore(x, p, d):
    if x <= d[p][0.25]:
        return 4
    elif x <= d[p][0.50]:
        return 3
    elif x <= d[p][0.75]: 
        return 2
    else:
        return 1
    
# Arguments (x = value, p = recency, monetary_value, frequency, d = quartiles dict)
def FMScore(x, p, d):
    if x <= d[p][0.25]:
        return 1
    elif x <= d[p][0.50]:
        return 2
    elif x <= d[p][0.75]: 
        return 3
    else:
        return 4

RFM Segemtation for last 7 days of customer orders

rfm_segmentation_7_days = df
rfm_segmentation_7_days['r_quartile_7_days'] = rfm_segmentation_7_days['recency'].apply(RScore, args=('recency',quantiles,))
rfm_segmentation_7_days['f_quartile_7_days'] = rfm_segmentation_7_days['orders_7_days'].apply(FMScore, args=('orders_7_days',quantiles,))
rfm_segmentation_7_days['m_quartile_7_days'] = rfm_segmentation_7_days['amt_last_7_days'].apply(FMScore, args=('amt_last_7_days',quantiles,))
rfm_segmentation_7_days.head()

RFM Segemtation for last 4 weeks of customer orders

rfm_segmentation_4_weeks = df
rfm_segmentation_4_weeks['r_quartile_4_weeks'] = rfm_segmentation_4_weeks['recency'].apply(RScore, args=('recency',quantiles,))
rfm_segmentation_4_weeks['f_quartile_4_weeks'] = rfm_segmentation_4_weeks['orders_last_4_weeks'].apply(FMScore, args=('orders_last_4_weeks',quantiles,))
rfm_segmentation_4_weeks['m_quartile_4_weeks'] = rfm_segmentation_4_weeks['amt_in_last_4_weeks'].apply(FMScore, args=('amt_in_last_4_weeks',quantiles,))
rfm_segmentation_4_weeks.head()

RFMScore = (r * f * m) * delivery_score

rfm_segmentation_7_days['RFMScore_7_days'] = rfm_segmentation_7_days.r_quartile_7_days \
                            * rfm_segmentation_7_days.f_quartile_7_days \
                            * rfm_segmentation_7_days.m_quartile_7_days \
                            * rfm_segmentation_7_days.delivery_score
rfm_segmentation_7_days.head()

rfm_segmentation_7_days.describe()

RFMScore = (r * f * m) * delivery_score

rfm_segmentation_4_weeks['RFMScore_4_weeks'] = rfm_segmentation_4_weeks.r_quartile_4_weeks \
                            * rfm_segmentation_4_weeks.f_quartile_4_weeks \
                            * rfm_segmentation_4_weeks.m_quartile_4_weeks \
                            * rfm_segmentation_4_weeks.delivery_score
rfm_segmentation_4_weeks.head()

rfm_segmentation_7_days.describe()['RFMScore_7_days']

count     9934.000000
mean       256.353922
std        395.715899
min         14.117647
25%         83.478261
50%        152.727273
75%        271.562998
max      11093.333333
Name: RFMScore_7_days, dtype: float64

rfm_segmentation_4_weeks.describe()['RFMScore_4_weeks']

count     9934.000000
mean       321.186736
std        500.580048
min         14.117647
25%         82.666667
50%        157.241379
75%        355.555556
max      12480.000000
Name: RFMScore_4_weeks, dtype: float64

Moving on to K Means Clustering

rfm_segmentation_7_days_values = rfm_segmentation_7_days[['amt_last_7_days', 'RFMScore_7_days']].values
rfm_segmentation_4_weeks_values = rfm_segmentation_4_weeks[['amt_in_last_4_weeks', 'RFMScore_4_weeks']].values

wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(rfm_segmentation_7_days_values)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(20,10))
plt.plot(range(1, 11), wcss)
plt.title('Finding the number of clusters - Elbow curve - last 7 days', fontsize=26)
plt.xlabel('Number of clusters', fontsize=24)
plt.ylabel('WCSS')
plt.savefig('./plots/7_days_elbow.png', format='png', dpi=300)
plt.show()

The curve says that 2 would be the ideal number of clusters

wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=0)
    kmeans.fit(rfm_segmentation_4_weeks_values)
    wcss.append(kmeans.inertia_)

plt.figure(figsize=(20,10))
plt.plot(range(1, 11), wcss)
plt.title('Finding the number of clusters - Elbow curve - last 4 weeks', fontsize=26)
plt.xlabel('Number of clusters', fontsize=24)
plt.ylabel('WCSS')
plt.savefig('./plots/4_weeks_elbow.png', format='png', dpi=300)
plt.show()

The curve says that 3 would be the ideal number of clusters

Applying K Means to customer data of past 7 days

kmeans = KMeans(n_clusters=2, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(rfm_segmentation_7_days_values)

# Visualizing the clusters
plt.figure(figsize=(20,10))
plt.scatter(rfm_segmentation_7_days_values[y_kmeans==0, 0], rfm_segmentation_7_days_values[y_kmeans==0, 1], s=100, c='red', label='Cluster 1', alpha=0.5)
plt.scatter(rfm_segmentation_7_days_values[y_kmeans==1, 0], rfm_segmentation_7_days_values[y_kmeans==1, 1], s=100, c='blue', label='Cluster 2', alpha=0.5)
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of customers for last 7 days', fontsize=26)
plt.xlabel('Amount', fontsize=24)
plt.ylabel('RFMScore', fontsize=24)
plt.legend(prop={'size': 30})
plt.savefig('./plots/k_means_7_days.png', format='png', dpi=300)
plt.show()

Applying K Means to customer data of past 4 weeks

kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
y_kmeans = kmeans.fit_predict(rfm_segmentation_4_weeks_values)

# Visualizing the clusters
plt.figure(figsize=(20,10))
plt.scatter(rfm_segmentation_4_weeks_values[y_kmeans==0, 0], rfm_segmentation_4_weeks_values[y_kmeans==0, 1], s=100, c='red', label='Cluster 1', alpha=0.5)
plt.scatter(rfm_segmentation_4_weeks_values[y_kmeans==1, 0], rfm_segmentation_4_weeks_values[y_kmeans==1, 1], s=100, c='blue', label='Cluster 2', alpha=0.5)
plt.scatter(rfm_segmentation_4_weeks_values[y_kmeans==2, 0], rfm_segmentation_4_weeks_values[y_kmeans==2, 1], s=100, c='green', label='Cluster 3', alpha=0.5)

plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of customers for last 4 weeks', fontsize=26)
plt.xlabel('Amount', fontsize=24)
plt.ylabel('RFMScore', fontsize=24)
plt.legend(prop={'size': 30})
plt.savefig('./plots/k_means_4_weeks.png', format='png', dpi=300)
plt.show()

Conclusion

In a dataset of 10000 rows
- Recent orders (Last 7 days) can be categorized into two parts. One being least spending customers and the others are returning customers
- Past orders (Last 4 weeks) can be categorized into three parts. One being the least spending, another being loyal customers and the other being returning customers
The customers in least spending customers (Red Cluster) have more orders but less amount spent
The customers who are loyal (Green Cluster) are more like to spend money along with more orders
The customers who are returning (Blue Cluster) are more likely to spend money eventhough their order quantity is less

Sudhanva Narayana

Sudhanva Narayana is a senior software engineer in Boston with experience in Machine Learning, Platform Engineering, and Full Stack Web Development.