Catie Fuentes
Features of Aisles Dataframe:
aisle_id: the aisle identification number
aisle: the corresponding name of the aisle
Features of Departments Dataframe:
department_id: the department identification number
department: the corresponding name of the department
Features of Products Dataframe:
product_id: product identification number
product_name: the corresponding product name
aisle_id: aisle identification number where the product is located
department_id: department identification number that the product belongs to
Features of Orders Dataframe:
order_id: order identification number
user_id: user identification number
eval_set: whether the order was a prior order or a train order
order_number: order number
order_dow: day of the week the order was placed
order_hour_of_day: hour of day the order was placed
days_since_prior_order: days since the users last order
Features of Prior Order Dataframe:
order_id: order identification number
product_id: product identification number
add_to_cart_order: the order in which each product was added to the cart
reordered: whether or not the product was reordered
Features of Train Order Dataframe:
order_id: order identification number
product_id: product identification number
add_to_cart_order: the order in which each product was added to the cart
reordered: whether or not the product was reordered
%matplotlib inline
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import altair as alt
import requests
matplotlib.style.use('ggplot')
sns.set_style("whitegrid")
import json
import pickle
import scipy
alt.data_transformers.enable('json')
#!pip install kaggle # <- This is only needed the first time, I believe
# Install needed libraries
import json
# API credentials
token={"username":"catiefuentes","key":"4583b186134810a0769888ac7c442d00"}
# Create a folder in your Google account using the unix command "mkdir"
!mkdir ~/.kaggle
# Place the API credentials in the right place
with open('/root/.kaggle/kaggle.json','w') as file:
json.dump(token, file)
# Change the permissions of the file kaggle.json
!chmod 600 /root/.kaggle/kaggle.json
# Get a list of the datasets in kaggle with "instacart" in its name
!kaggle datasets list -s instacart
# Download the appropriate dataset and put it in the content folder
!kaggle datasets download -d psparks/instacart-market-basket-analysis -p /content
#Decript (unzip) all the encrypted files (with the ending ".zip")
!unzip \*.zip
#### COMMENTED OUT FOR USE ONLY IF KAGGLE API IS NOT WORKING ###
#url='http://136.183.139.117/dsa501data/instacart/'
#aisles=pd.read_csv(url+'aisles.csv')
#departments=pd.read_csv(url+'departments.csv')
#order_products_prior=pd.read_csv(url+'order_products__prior.csv')
#order_products_train=pd.read_csv(url+'order_products__train.csv')
#orders=pd.read_csv(url+'orders.csv')
#products=pd.read_csv(url+'products.csv')
#files=['aisles','departments','order_products__prior','order_products__train','orders','products']
#for file in files:
#globals()[file]=pd.read_csv(url+file+'.csv')
# created dataframes for each dataset
orders = pd.read_csv('orders.csv')
products = pd.read_csv('products.csv')
order_products_prior = pd.read_csv('order_products__prior.csv')
order_products_train = pd.read_csv('order_products__train.csv')
aisles = pd.read_csv('aisles.csv')
departments = pd.read_csv('departments.csv')
files=['aisles','departments','order_products__prior','order_products__train','orders','products']
for file in files:
globals()[file]=pd.read_csv(file+'.csv')
# Got this code from JVP book. It displays frames side by side
class display(object):
"""Display HTML representation of multiple objects"""
template = """<div style="float: left; padding: 10px;">
<p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
</div>"""
def __init__(self, *args):
self.args = args
def _repr_html_(self):
return '\n'.join(self.template.format(a, eval(a)._repr_html_())
for a in self.args)
def __repr__(self):
return '\n\n'.join(a + '\n' + repr(eval(a))
for a in self.args)
display(files[0],files[1],files[2])
display(files[3],files[4],files[5])
features_dict={}
for file in files:
features_dict[file]=[globals()[file].columns.values]
df=pd.DataFrame(features_dict).T
df.rename(columns = {0:'Features'}, inplace=True)
df.index.name = "File"
df = df.style.set_properties(**{'text-align': 'left'})
df
aisles.head()
departments.head()
orders.head()
orders.describe()
orders['user_id'].value_counts()
products.set_index('product_id', inplace=True)
products.head()
products.describe()
order_products__prior.head()
order_products__train.head()
plt.figure(figsize=(12,8))
sns.countplot(x="order_dow", data=orders)
plt.ylabel('Count', fontsize=12)
plt.xlabel('Day of week', fontsize=12)
plt.xticks(rotation='vertical')
plt.title("Frequency of orders by day of week", fontsize=15)
plt.show()
plt.figure(figsize=(12,8))
sns.countplot(x="order_hour_of_day", data=orders)
plt.ylabel('Count', fontsize=12)
plt.xlabel('Hour of day', fontsize=12)
plt.xticks(rotation='vertical')
plt.title("Frequency of orders by hour of day", fontsize=15)
plt.show()
order_products = order_products_prior.append(order_products_train)
order_products.shape
order_products.head()
order_products.product_id.nunique()
product_counts = order_products.groupby('product_id')['order_id'].count().reset_index().rename(columns = {'order_id':'frequency'})
product_counts = product_counts.sort_values('frequency', ascending=False)[0:100].reset_index(drop = True)
product_counts = product_counts.merge(products, on = 'product_id', how = 'left')
product_counts.head(10)
ax = sns.barplot(x="frequency", y="product_name", data=(product_counts).head(10))
plt.title("Frequency of top 10 ordered products", fontsize=15)
product_departments = product_counts.merge(departments)
product_departments
ax = sns.barplot(x="frequency", y="department", data=product_departments)
plt.title("Frequency of products ordered from each department", fontsize=15)
#merge tables: orders and order_products__prior
df=orders.merge(order_products__prior)
veg_df=df.merge(product_departments)
veg_df
#dropping any departments that would not be vegetarian-friendly
vegetarian = veg_df[veg_df["department"].str.contains("dairy eggs|meat seafood|deli") == False]
vegetarian = vegetarian.merge(aisles)
plant_df = vegetarian[vegetarian["aisle"].str.contains("meat|dairy|poultry|chicken|beef|fish|jerky") == False]
#filter to see only add_to_cart_order 1 & reorders
plant_df[(veg_df.add_to_cart_order==1) & (veg_df.reordered==1)]
plant_df.fillna(0, inplace = True)
plant_df.isnull().sum()
len(plant_df['user_id'].unique())
plt.rcParams['figure.figsize'] = (18, 7)
sns.countplot(plant_df['product_name'])
plt.title('Frequency of most popular products',fontsize = 30)
plt.xticks(rotation = 90 )
plt.show()
plant_df2 = plant_df[["order_hour_of_day","order_dow"]].mean()
plant_df2
plt.figure(figsize=(12,8))
sns.countplot(x="order_dow", data=plant_df)
plt.ylabel('Count', fontsize=12)
plt.xlabel('Day of week', fontsize=12)
plt.xticks(rotation='vertical')
plt.title("Frequency of plant-based orders by day of week", fontsize=15)
plt.show()
plt.figure(figsize=(12,8))
sns.countplot(x="order_hour_of_day", data=plant_df)
plt.ylabel('Count', fontsize=12)
plt.xlabel('Hour of day', fontsize=12)
plt.xticks(rotation='vertical')
plt.title("Frequency of plant-based orders by hour of day", fontsize=15)
plt.show()
plant_products = plant_df['product_name'].value_counts()
plant_products
Banana 472565
Bag of Organic Bananas 379450
Organic Strawberries 264683
Organic Baby Spinach 241921
Organic Hass Avocado 213584
from wordcloud import WordCloud, STOPWORDS
text = plant_products
wordcloud = WordCloud(
width = 3000,
height = 2000,
background_color = 'white',
stopwords = STOPWORDS).generate(str(text))
fig = plt.figure(
figsize = (20, 20),
facecolor = 'k',
edgecolor = 'k')
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules
pb_counts = plant_df.groupby('product_id')['order_id'].count().reset_index().rename(columns = {'order_id':'frequency'})
pb_counts = plant_df.sort_values('frequency', ascending=False)[0:100].reset_index(drop = True)
pb_counts = plant_df.merge(products, on = 'product_id', how = 'left')
pb_counts.head(10)
freq_products = list(product_counts.product_id)
freq_products[1:10]
pb_products = plant_df[plant_df.product_id.isin(freq_products)]
pb_products.shape
pb_products = pb_products.merge(products, on = 'product_id', how='left')
pb_products.head()
basket_analysis = plant_df.groupby(['order_id', 'product_name'])['reordered'].count().unstack().reset_index().fillna(0).set_index('order_id')
basket_analysis.head()
def encode_units(x):
if x <= 0:
return 0
if x >= 1:
return 1
basket_analysis = basket_analysis.applymap(encode_units)
basket_analysis.head()
basket_analysis.size
basket_analysis.shape
frequent_items = apriori(basket_analysis, min_support=0.01, use_colnames=True)
frequent_items.head()
frequent_items.tail()
frequent_items.shape
Apriori Algorithm:
Apriori algorithm assumes that any subset of a frequent itemset must be frequent.
Association Rule Mining:
a technique to identify underlying relations between different items
used when we want to find an associations between different items or find frequent patterns in a dataset
Matrices
Support: the number of transactions containing product X divided by the total number of transactions
Confidence: Measures how often items in Y appear in transactions that contain X
Conviction: A high conviction value means that the consequent is highly depending on the antecedent
rules = association_rules(frequent_items, metric="lift", min_threshold=1)
rules.sort_values('lift', ascending=False)
import random
support = rules['support']
confidence = rules['confidence']
for i in range (len(support)):
support[i] = support[i] + 0.0025 * (random.randint(1,10) - 5)
confidence[i] = confidence[i] + 0.0025 * (random.randint(1,10) - 5)
plt.scatter(support, confidence, alpha=0.5, marker="*")
plt.title("Association of plant-based products")
plt.xlabel('support')
plt.ylabel('confidence')
plt.show()
Banana 472565
Bag of Organic Bananas 379450
Organic Strawberries 264683
Organic Baby Spinach 241921
Organic Hass Avocado 213584
(Organic Strawberries, Organic Blueberries)
(Organic Garlic, Organic Yellow Onion)
(Organic Raspberries, Organic Hass Avocado)
(Organic Strawberries, Organic Hass Avocado)
(Organic Strawberries, Organic Raspberries)
%%shell
jupyter nbconvert --to html /content/Fuentes_Final_Project.ipynb