Final Project: Group 13¶

Ask 1¶

Dataset description:¶

The dataset consists of detailed transactional data from Instacart, a leading online grocery delivery platform. It contains over 3 million grocery orders made by more than 200,000 customers, including information on order sequence, product details, user behavior, and repeat purchases. This dataset is ideal for exploring user purchasing patterns, building predictive models, and performing market basket analysis.

Dataset source¶

The dataset was open-sourced by Instacart, a grocery delivery service in the United States. It was first made available through Instacart's blog post titled "3 Million Instacart Orders, Open-Sourced". The dataset is also hosted on Kaggle, a platform for data science and machine learning competitions.

The dataset is provided as-is for non-commercial use and can be downloaded from S3 at: https://www.instacart.com/datasets/grocery-shopping-2017

Kaggle: https://www.kaggle.com/competitions/instacart-market-basket-analysis/data

For information about the contents of the files see the data dictionary: https://gist.github.com/jeremystan/c3b39d947d9b88b3ccff3147dbcf6c6b

Why this dataset?¶

The dataset is significant because it provides real-world e-commerce data that can be used for advanced analytics and machine learning, such as building recommendation systems or customer retention strategies. What appeals to us most is its richness and granularity, which allow for deep insights into user behavior and operational efficiency. Additionally, its open-source nature makes it accessible and suitable for academic and practical applications like dimensional modeling and analytical exploration.

Is it suitable for dimensional modeling and analysis?¶

We load the csv files and perform initial exploration of the data to make sure it is suitable.

Unzip the files to see what we are working with. The sample_submission.csv file does not contain information useful to this project, so it can be ignored.

In [1]:
#!unzip -o InstaCartFinalProject.zip

The dataset we decided to work on includes seven tables. Since we are not building models to predict whether a customer will reorder an item, we stack the prior and train data together so that all products that were ordered are in one file.

In [2]:
#!csvstack order_products*.csv >  orderproduct.csv

We can use csvcut to examine the headers of the files we have uploaded and make sure they match the original files.

In [3]:
!csvcut -n orderproduct.csv
  1: order_id
  2: product_id
  3: add_to_cart_order
  4: reordered
In [4]:
!csvcut -n order_products__prior.csv
  1: order_id
  2: product_id
  3: add_to_cart_order
  4: reordered
In [5]:
!csvcut -n order_products__train.csv
  1: order_id
  2: product_id
  3: add_to_cart_order
  4: reordered

We look at the summary stats of the csv files.

In [6]:
!head -n 10000 aisles.csv | csvstat
  1. "aisle_id"

	Type of data:          Number
	Contains null values:  False
	Non-null values:       134
	Unique values:         134
	Smallest value:        1
	Largest value:         134
	Sum:                   9045
	Mean:                  67.5
	Median:                67.5
	StDev:                 38.827
	Most decimal places:   0
	Most common values:    1 (1x)
	                       2 (1x)
	                       3 (1x)
	                       4 (1x)
	                       5 (1x)

  2. "aisle"

	Type of data:          Text
	Contains null values:  False
	Non-null values:       134
	Unique values:         134
	Longest value:         29 characters
	Most common values:    prepared soups salads (1x)
	                       specialty cheeses (1x)
	                       energy granola bars (1x)
	                       instant foods (1x)
	                       marinades meat preparation (1x)

Row count: 134
In [7]:
!head -n 10000 departments.csv | csvstat
  1. "department_id"

	Type of data:          Number
	Contains null values:  False
	Non-null values:       21
	Unique values:         21
	Smallest value:        1
	Largest value:         21
	Sum:                   231
	Mean:                  11
	Median:                11
	StDev:                 6.205
	Most decimal places:   0
	Most common values:    1 (1x)
	                       2 (1x)
	                       3 (1x)
	                       4 (1x)
	                       5 (1x)

  2. "department"

	Type of data:          Text
	Contains null values:  False
	Non-null values:       21
	Unique values:         21
	Longest value:         15 characters
	Most common values:    frozen (1x)
	                       other (1x)
	                       bakery (1x)
	                       produce (1x)
	                       alcohol (1x)

Row count: 21
In [8]:
!head -n 10000 orders.csv | csvstat
  1. "order_id"

	Type of data:          Number
	Contains null values:  False
	Non-null values:       9999
	Unique values:         9999
	Smallest value:        40
	Largest value:         3420572
	Sum:                   17120524678
	Mean:                  1712223.69
	Median:                1694267
	StDev:                 988604.717
	Most decimal places:   0
	Most common values:    2539329 (1x)
	                       2398795 (1x)
	                       473747 (1x)
	                       2254736 (1x)
	                       431534 (1x)

  2. "user_id"

	Type of data:          Number
	Contains null values:  False
	Non-null values:       9999
	Unique values:         633
	Smallest value:        1
	Largest value:         633
	Sum:                   3127903
	Mean:                  312.822
	Median:                309
	StDev:                 177.328
	Most decimal places:   0
	Most common values:    210 (100x)
	                       310 (100x)
	                       313 (100x)
	                       626 (95x)
	                       516 (94x)

  3. "eval_set"

	Type of data:          Text
	Contains null values:  False
	Non-null values:       9999
	Unique values:         3
	Longest value:         5 characters
	Most common values:    prior (9367x)
	                       train (407x)
	                       test (225x)

  4. "order_number"

	Type of data:          Number
	Contains null values:  False
	Non-null values:       9999
	Unique values:         100
	Smallest value:        1
	Largest value:         100
	Sum:                   163779
	Mean:                  16.38
	Median:                10
	StDev:                 17.364
	Most decimal places:   0
	Most common values:    1 (633x)
	                       2 (633x)
	                       3 (633x)
	                       4 (633x)
	                       5 (565x)

  5. "order_dow"

	Type of data:          Number
	Contains null values:  False
	Non-null values:       9999
	Unique values:         7
	Smallest value:        0
	Largest value:         6
	Sum:                   27648
	Mean:                  2.765
	Median:                3
	StDev:                 2.03
	Most decimal places:   0
	Most common values:    1 (1853x)
	                       0 (1666x)
	                       5 (1414x)
	                       2 (1376x)
	                       3 (1251x)

  6. "order_hour_of_day"

	Type of data:          Number
	Contains null values:  False
	Non-null values:       9999
	Unique values:         24
	Smallest value:        0
	Largest value:         23
	Sum:                   134420
	Mean:                  13.443
	Median:                13
	StDev:                 4.191
	Most decimal places:   0
	Most common values:    10 (874x)
	                       11 (865x)
	                       12 (856x)
	                       14 (839x)
	                       13 (818x)

  7. "days_since_prior_order"

	Type of data:          Number
	Contains null values:  True (excluded from calculations)
	Non-null values:       9366
	Unique values:         32
	Smallest value:        0
	Largest value:         30
	Sum:                   105227
	Mean:                  11.235
	Median:                8
	StDev:                 9.21
	Most decimal places:   0
	Most common values:    30 (1092x)
	                       7 (973x)
	                       6 (678x)
	                       4 (667x)
	                       None (633x)

Row count: 9999
In [9]:
!head -n 10000 products.csv | csvstat
  1. "product_id"

	Type of data:          Number
	Contains null values:  False
	Non-null values:       9999
	Unique values:         9999
	Smallest value:        1
	Largest value:         9999
	Sum:                   49995000
	Mean:                  5000
	Median:                5000
	StDev:                 2886.607
	Most decimal places:   0
	Most common values:    1 (1x)
	                       2 (1x)
	                       3 (1x)
	                       4 (1x)
	                       5 (1x)

  2. "product_name"

	Type of data:          Text
	Contains null values:  False
	Non-null values:       9999
	Unique values:         9999
	Longest value:         153 characters
	Most common values:    Chocolate Sandwich Cookies (1x)
	                       All-Seasons Salt (1x)
	                       Robust Golden Unsweetened Oolong Tea (1x)
	                       Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce (1x)
	                       Green Chile Anytime Sauce (1x)

  3. "aisle_id"

	Type of data:          Number
	Contains null values:  False
	Non-null values:       9999
	Unique values:         134
	Smallest value:        1
	Largest value:         134
	Sum:                   685543
	Mean:                  68.561
	Median:                72
	StDev:                 38.133
	Most decimal places:   0
	Most common values:    100 (240x)
	                       47 (223x)
	                       45 (219x)
	                       37 (214x)
	                       120 (201x)

  4. "department_id"

	Type of data:          Number
	Contains null values:  False
	Non-null values:       9999
	Unique values:         21
	Smallest value:        1
	Largest value:         21
	Sum:                   116610
	Mean:                  11.662
	Median:                13
	StDev:                 5.876
	Most decimal places:   0
	Most common values:    11 (1334x)
	                       19 (1217x)
	                       13 (1052x)
	                       7 (908x)
	                       1 (860x)

Row count: 9999
In [10]:
!head -n 10000 orderproduct.csv | csvstat
  1. "order_id"

	Type of data:          Number
	Contains null values:  False
	Non-null values:       9999
	Unique values:         977
	Smallest value:        2
	Largest value:         1022
	Sum:                   5224852
	Mean:                  522.537
	Median:                529
	StDev:                 294.524
	Most decimal places:   0
	Most common values:    775 (46x)
	                       742 (45x)
	                       454 (44x)
	                       970 (39x)
	                       315 (38x)

  2. "product_id"

	Type of data:          Number
	Contains null values:  False
	Non-null values:       9999
	Unique values:         4511
	Smallest value:        23
	Largest value:         49683
	Sum:                   256879703
	Mean:                  25690.539
	Median:                25230
	StDev:                 14087.017
	Most decimal places:   0
	Most common values:    24852 (156x)
	                       13176 (119x)
	                       21137 (73x)
	                       21903 (70x)
	                       47209 (68x)

  3. "add_to_cart_order"

	Type of data:          Number
	Contains null values:  False
	Non-null values:       9999
	Unique values:         46
	Smallest value:        1
	Largest value:         46
	Sum:                   82038
	Mean:                  8.205
	Median:                6
	StDev:                 6.765
	Most decimal places:   0
	Most common values:    1 (977x)
	                       2 (940x)
	                       3 (895x)
	                       4 (831x)
	                       5 (767x)

  4. "reordered"

	Type of data:          Boolean
	Contains null values:  False
	Non-null values:       9999
	Unique values:         2
	Most common values:    True (6065x)
	                       False (3934x)

Row count: 9999

From the csvstat output we can see the column names and data types in each file (mostly numeric and text, plus one boolean). Most columns contain no null values; the exception is the days_since_prior_order column in orders.csv, which can contain nulls. We also note that aisles.csv and departments.csv contain only unique values.

Understand relationships¶

Now we check the relationships between files. To assess whether the data is suitable for dimensional modeling we need to confirm key relationships (e.g., primary keys and foreign keys).

In [11]:
!head -n 10 orderproduct.csv | csvlook
| order_id | product_id | add_to_cart_order | reordered |
| -------- | ---------- | ----------------- | --------- |
|        2 |     33,120 |                 1 |      True |
|        2 |     28,985 |                 2 |      True |
|        2 |      9,327 |                 3 |     False |
|        2 |     45,918 |                 4 |      True |
|        2 |     30,035 |                 5 |     False |
|        2 |     17,794 |                 6 |      True |
|        2 |     40,141 |                 7 |      True |
|        2 |      1,819 |                 8 |      True |
|        2 |     43,668 |                 9 |     False |

We can see from the table above that order_id, the primary key of orders.csv, is used as a foreign key in orderproduct.csv. Similarly, product_id, the primary key of products.csv, is used as a foreign key in orderproduct.csv.
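
Once the tables are loaded into PostgreSQL in Ask 2, these relationships can be double-checked with an anti-join. A minimal sketch (not run as part of this notebook) that counts fact rows with no matching parent row; both counts should be zero:

SELECT
    SUM(CASE WHEN o.order_id IS NULL THEN 1 ELSE 0 END)   AS orphan_order_ids,
    SUM(CASE WHEN p.product_id IS NULL THEN 1 ELSE 0 END) AS orphan_product_ids
FROM orderproduct op
LEFT JOIN orders o ON op.order_id = o.order_id
LEFT JOIN product p ON op.product_id = p.product_id;

The same check applies to aisle_id and department_id in products.csv against aisles.csv and departments.csv.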

In [12]:
!head -n 10 orders.csv | csvlook
|  order_id | user_id | eval_set | order_number | order_dow | order_hour_of_day | days_since_prior_order |
| --------- | ------- | -------- | ------------ | --------- | ----------------- | ---------------------- |
| 2,539,329 |    True | prior    |            1 |         2 |                 8 |                        |
| 2,398,795 |    True | prior    |            2 |         3 |                 7 |                     15 |
|   473,747 |    True | prior    |            3 |         3 |                12 |                     21 |
| 2,254,736 |    True | prior    |            4 |         4 |                 7 |                     29 |
|   431,534 |    True | prior    |            5 |         4 |                15 |                     28 |
| 3,367,565 |    True | prior    |            6 |         2 |                 7 |                     19 |
|   550,135 |    True | prior    |            7 |         1 |                 9 |                     20 |
| 3,108,588 |    True | prior    |            8 |         1 |                14 |                     14 |
| 2,295,261 |    True | prior    |            9 |         1 |                16 |                      0 |
In [13]:
!head -n 10 products.csv | csvlook
| product_id | product_name                                                      | aisle_id | department_id |
| ---------- | ----------------------------------------------------------------- | -------- | ------------- |
|          1 | Chocolate Sandwich Cookies                                        |       61 |            19 |
|          2 | All-Seasons Salt                                                  |      104 |            13 |
|          3 | Robust Golden Unsweetened Oolong Tea                              |       94 |             7 |
|          4 | Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce |       38 |             1 |
|          5 | Green Chile Anytime Sauce                                         |        5 |            13 |
|          6 | Dry Nose Oil                                                      |       11 |            11 |
|          7 | Pure Coconut Water With Orange                                    |       98 |             7 |
|          8 | Cut Russet Potatoes Steam N' Mash                                 |      116 |             1 |
|          9 | Light Strawberry Blueberry Yogurt                                 |      120 |            16 |

We also see that aisle_id and department_id in products.csv are foreign keys referencing the primary keys of aisles.csv and departments.csv, respectively.

In [14]:
!head -n 10 aisles.csv | csvlook
| aisle_id | aisle                      |
| -------- | -------------------------- |
|        1 | prepared soups salads      |
|        2 | specialty cheeses          |
|        3 | energy granola bars        |
|        4 | instant foods              |
|        5 | marinades meat preparation |
|        6 | other                      |
|        7 | packaged meat              |
|        8 | bakery desserts            |
|        9 | pasta sauce                |
In [15]:
!head -n 10 departments.csv | csvlook
| department_id | department      |
| ------------- | --------------- |
|             1 | frozen          |
|             2 | other           |
|             3 | bakery          |
|             4 | produce         |
|             5 | alcohol         |
|             6 | international   |
|             7 | beverages       |
|             8 | pets            |
|             9 | dry goods pasta |

The relationships above show how the aisles and departments tables relate to the products table, and how the products and orders tables link to the orderproduct table. The orders and orderproduct tables also contain numerical fields we can use as facts and categorical fields we can use as dimensions. Hence, the data appears suitable for dimensional modeling and analysis.

Checking the csv files for errors to see whether any cleaning needs to be done.

In [16]:
!csvclean aisles.csv
!csvclean departments.csv
!csvclean orders.csv
!csvclean products.csv
!csvclean orderproduct.csv
No errors.
No errors.
No errors.
No errors.
No errors.

Analytical questions we want to answer¶

1) What are the most frequently purchased products across all users? How (if at all) does this vary by time/day?

2) Which products (or product categories) have the highest likelihood of being reordered?

3) How does the time between orders impact the likelihood of products being reordered?

Concerns with the data¶

  1. The orderproduct file has 33,819,106 rows and orders has 3,421,083 rows, which is quite large, so commands run against these files take some time to complete.
  2. There are missing values in the days_since_prior_order column, which we will need to account for in our analysis (see the sketch after this list).
  3. The dataset was originally split into prior (orders placed before each user's most recent order), train (training data for competition participants), and test (held out for model prediction) sets. We combined these sets into one file for this project, and we use the eval_set column in orders only to identify orders placed before a user's most recent order.
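
One note on handling the missing values from item 2: SQL aggregate functions such as AVG() ignore NULLs, so averages over this column implicitly skip each user's first order (where the value is undefined). A minimal sketch of the two options, using the column name the table gets when loaded in Ask 2 (days_since_last_order):

SELECT
    ROUND(AVG(days_since_last_order), 2)              AS avg_excluding_first_orders,    -- NULLs skipped by AVG
    ROUND(AVG(COALESCE(days_since_last_order, 0)), 2) AS avg_counting_first_orders_as_0 -- explicit assumption
FROM orders;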

Ask 2: Wrangling the data¶

Step 1: Data Loading¶

1. Load the essential packages and connect to SQL¶

Use PostgreSQL's dropdb command to delete the database named finalp, if it exists. This step ensures the notebook can be run multiple times without issues. If you encounter an error stating that database "finalp" does not exist, you can ignore it. However, if the error indicates "There is 1 other session using the database," restart the kernel and try the command again.

In [17]:
!dropdb -U postgres finalp

Now use PostgreSQL's createdb command to create the database named finalp

In [18]:
!createdb -U postgres finalp

Loading the Image class to display graphs.

In [1]:
from IPython.display import Image

Connecting to database

In [2]:
%load_ext sql
In [3]:
%sql postgresql://postgres@/finalp
Connecting to 'postgresql://postgres@/finalp'

2. Create tables and load data¶

Creating table aisle.

In [22]:
%%sql
DROP TABLE IF EXISTS aisle;

CREATE TABLE aisle (
    aisle_id INTEGER NOT NULL,
    aisle VARCHAR(50) NOT NULL
)
Running query in 'postgresql://postgres@/finalp'
Out[22]:

Loading the data in.

In [23]:
%%sql
COPY aisle FROM  '/home/ubuntu/FinalAssignment/aisles.csv'
CSV
HEADER;
Running query in 'postgresql://postgres@/finalp'
134 rows affected.
Out[23]:

Checking that the loaded row count matches the source file (the CSV has one header line, so wc reports one extra line).

In [24]:
!wc -l aisles.csv
135 aisles.csv

Checking if data loading is successful.

In [25]:
%%sql
SELECT * FROM aisle
LIMIT 10
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[25]:
aisle_id aisle
1 prepared soups salads
2 specialty cheeses
3 energy granola bars
4 instant foods
5 marinades meat preparation
6 other
7 packaged meat
8 bakery desserts
9 pasta sauce
10 kitchen supplies
Truncated to displaylimit of 10.

Checking if aisle_id has duplicates.

In [26]:
%%sql
SELECT aisle_id, COUNT(*) AS id_count
FROM aisle
GROUP BY aisle_id
HAVING COUNT(*) > 1;
Running query in 'postgresql://postgres@/finalp'
Out[26]:
aisle_id id_count

No duplicates found.

Adding aisle_key as the surrogate key.

In [27]:
%%sql
ALTER TABLE aisle
ADD COLUMN aisle_key SERIAL,
ADD CONSTRAINT pk_aisle_key PRIMARY KEY (aisle_key);
Running query in 'postgresql://postgres@/finalp'
Out[27]:
In [28]:
%%sql
SELECT * FROM aisle
LIMIT 10
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[28]:
aisle_id aisle aisle_key
1 prepared soups salads 1
2 specialty cheeses 2
3 energy granola bars 3
4 instant foods 4
5 marinades meat preparation 5
6 other 6
7 packaged meat 7
8 bakery desserts 8
9 pasta sauce 9
10 kitchen supplies 10
Truncated to displaylimit of 10.

The same steps are performed on the other csv files.

In [29]:
%%sql
DROP TABLE IF EXISTS department;

CREATE TABLE department (
    department_id INTEGER NOT NULL,
    department VARCHAR(50) NOT NULL
)
Running query in 'postgresql://postgres@/finalp'
Out[29]:
In [30]:
%%sql
COPY department FROM  '/home/ubuntu/FinalAssignment/departments.csv'
CSV
HEADER;
Running query in 'postgresql://postgres@/finalp'
21 rows affected.
Out[30]:
In [31]:
!wc -l departments.csv
22 departments.csv
In [32]:
%%sql
SELECT * FROM department
LIMIT 10
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[32]:
department_id department
1 frozen
2 other
3 bakery
4 produce
5 alcohol
6 international
7 beverages
8 pets
9 dry goods pasta
10 bulk
Truncated to displaylimit of 10.

Checking if department_id has duplicates.

In [33]:
%%sql
SELECT department_id, COUNT(*) AS id_count
FROM department
GROUP BY department_id
HAVING COUNT(*) > 1;
Running query in 'postgresql://postgres@/finalp'
Out[33]:
department_id id_count

No duplicates found.

In [34]:
%%sql
ALTER TABLE department
ADD COLUMN department_key SERIAL,
ADD CONSTRAINT pk_department_key PRIMARY KEY (department_key);
Running query in 'postgresql://postgres@/finalp'
Out[34]:
In [35]:
%%sql
SELECT * FROM department
LIMIT 10
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[35]:
department_id department department_key
1 frozen 1
2 other 2
3 bakery 3
4 produce 4
5 alcohol 5
6 international 6
7 beverages 7
8 pets 8
9 dry goods pasta 9
10 bulk 10
Truncated to displaylimit of 10.
In [36]:
%%sql
DROP TABLE IF EXISTS orders;

CREATE TABLE orders (
    order_id INTEGER NOT NULL,
    user_id INTEGER NOT NULL,
    eval_set VARCHAR(15) NOT NULL,
    order_number INTEGER NOT NULL,
    order_dow INTEGER NOT NULL,
    hour_of_day INTEGER NOT NULL,
    days_since_last_order NUMERIC NULL
)
Running query in 'postgresql://postgres@/finalp'
Out[36]:
In [37]:
%%sql
COPY orders FROM  '/home/ubuntu/FinalAssignment/orders.csv'
CSV
HEADER;
Running query in 'postgresql://postgres@/finalp'
3421083 rows affected.
Out[37]:
In [38]:
!wc -l orders.csv
3421084 orders.csv
In [39]:
%%sql
SELECT * FROM orders
LIMIT 10
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[39]:
order_id user_id eval_set order_number order_dow hour_of_day days_since_last_order
2539329 1 prior 1 2 8 None
2398795 1 prior 2 3 7 15.0
473747 1 prior 3 3 12 21.0
2254736 1 prior 4 4 7 29.0
431534 1 prior 5 4 15 28.0
3367565 1 prior 6 2 7 19.0
550135 1 prior 7 1 9 20.0
3108588 1 prior 8 1 14 14.0
2295261 1 prior 9 1 16 0.0
2550362 1 prior 10 4 8 30.0
Truncated to displaylimit of 10.

Checking if order_id has duplicates.

In [40]:
%%sql
SELECT order_id, COUNT(*) AS id_count
FROM orders
GROUP BY order_id
HAVING COUNT(*) > 1;
Running query in 'postgresql://postgres@/finalp'
Out[40]:
order_id id_count

No duplicates found.

In [41]:
%%sql
ALTER TABLE orders
ADD COLUMN order_key SERIAL,
ADD CONSTRAINT pk_order_key PRIMARY KEY (order_key);
Running query in 'postgresql://postgres@/finalp'
Out[41]:
In [42]:
%%sql
SELECT * FROM orders
LIMIT 10
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[42]:
order_id user_id eval_set order_number order_dow hour_of_day days_since_last_order order_key
2539329 1 prior 1 2 8 None 1
2398795 1 prior 2 3 7 15.0 2
473747 1 prior 3 3 12 21.0 3
2254736 1 prior 4 4 7 29.0 4
431534 1 prior 5 4 15 28.0 5
3367565 1 prior 6 2 7 19.0 6
550135 1 prior 7 1 9 20.0 7
3108588 1 prior 8 1 14 14.0 8
2295261 1 prior 9 1 16 0.0 9
2550362 1 prior 10 4 8 30.0 10
Truncated to displaylimit of 10.
In [43]:
%%sql
DROP TABLE IF EXISTS product;

CREATE TABLE product (
    product_id INTEGER NOT NULL,
    name VARCHAR(200) NOT NULL,
    aisle_id INTEGER NOT NULL,
    department_id INTEGER NOT NULL
)
Running query in 'postgresql://postgres@/finalp'
Out[43]:
In [44]:
%%sql
COPY product FROM  '/home/ubuntu/FinalAssignment/products.csv'
CSV
HEADER;
Running query in 'postgresql://postgres@/finalp'
49688 rows affected.
Out[44]:
In [45]:
!wc -l products.csv
49689 products.csv
In [46]:
%%sql
SELECT * FROM product
LIMIT 10
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[46]:
product_id name aisle_id department_id
1 Chocolate Sandwich Cookies 61 19
2 All-Seasons Salt 104 13
3 Robust Golden Unsweetened Oolong Tea 94 7
4 Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce 38 1
5 Green Chile Anytime Sauce 5 13
6 Dry Nose Oil 11 11
7 Pure Coconut Water With Orange 98 7
8 Cut Russet Potatoes Steam N' Mash 116 1
9 Light Strawberry Blueberry Yogurt 120 16
10 Sparkling Orange Juice & Prickly Pear Beverage 115 7
Truncated to displaylimit of 10.

Checking if product_id has duplicates.

In [47]:
%%sql
SELECT product_id, COUNT(*) AS id_count
FROM product
GROUP BY product_id
HAVING COUNT(*) > 1;
Running query in 'postgresql://postgres@/finalp'
Out[47]:
product_id id_count

No duplicates found.

In [48]:
%%sql
ALTER TABLE product
ADD COLUMN product_key SERIAL,
ADD CONSTRAINT pk_product_key PRIMARY KEY (product_key);
Running query in 'postgresql://postgres@/finalp'
Out[48]:
In [49]:
%%sql
SELECT * FROM product
LIMIT 10
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[49]:
product_id name aisle_id department_id product_key
1 Chocolate Sandwich Cookies 61 19 1
2 All-Seasons Salt 104 13 2
3 Robust Golden Unsweetened Oolong Tea 94 7 3
4 Smart Ones Classic Favorites Mini Rigatoni With Vodka Cream Sauce 38 1 4
5 Green Chile Anytime Sauce 5 13 5
6 Dry Nose Oil 11 11 6
7 Pure Coconut Water With Orange 98 7 7
8 Cut Russet Potatoes Steam N' Mash 116 1 8
9 Light Strawberry Blueberry Yogurt 120 16 9
10 Sparkling Orange Juice & Prickly Pear Beverage 115 7 10
Truncated to displaylimit of 10.
In [50]:
%%sql
DROP TABLE IF EXISTS orderproduct;

CREATE TABLE orderproduct (
    order_id INTEGER NOT NULL,
    product_id INTEGER NOT NULL,
    current_items_cart INTEGER NOT NULL,
    reordered BOOLEAN NOT NULL
)
Running query in 'postgresql://postgres@/finalp'
Out[50]:
In [51]:
%%sql
COPY orderproduct FROM  '/home/ubuntu/FinalAssignment/orderproduct.csv'
CSV
HEADER;
Running query in 'postgresql://postgres@/finalp'
33819106 rows affected.
Out[51]:
In [52]:
!wc -l orderproduct.csv
33819107 orderproduct.csv
In [53]:
%%sql
SELECT * FROM orderproduct
LIMIT 10
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[53]:
order_id product_id current_items_cart reordered
2 33120 1 True
2 28985 2 True
2 9327 3 False
2 45918 4 True
2 30035 5 False
2 17794 6 True
2 40141 7 True
2 1819 8 True
2 43668 9 False
3 33754 1 True
Truncated to displaylimit of 10.
In [54]:
%%sql
SELECT COUNT(*) FROM aisle;
Running query in 'postgresql://postgres@/finalp'
1 rows affected.
Out[54]:
count
134
In [55]:
%%sql
SELECT COUNT(*) FROM department;
Running query in 'postgresql://postgres@/finalp'
1 rows affected.
Out[55]:
count
21
In [56]:
%%sql
SELECT COUNT(*) FROM orders;
Running query in 'postgresql://postgres@/finalp'
1 rows affected.
Out[56]:
count
3421083
In [57]:
%%sql
SELECT COUNT(*) FROM product;
Running query in 'postgresql://postgres@/finalp'
1 rows affected.
Out[57]:
count
49688
In [58]:
%%sql
SELECT COUNT(*) FROM orderproduct;
Running query in 'postgresql://postgres@/finalp'
1 rows affected.
Out[58]:
count
33819106

Data loading is complete at this point, and we have the following table structure:

In [41]:
Image(filename="/home/ubuntu/FinalAssignment/table_structure1.png")
Out[41]:

Step 2: Data Cleaning and Modeling¶

Dimension tables: we have two dimensions that we can use right away, aisle and department. The orders table and the product table both need to be modified to better serve the purpose of this project.
Fact table: our fact table is built from the orderproduct table, as it is the one table that records the products sold in each order. Because the orderproduct table does not yet have foreign keys linking it to the dimension tables, we modify it below.
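
Each dimension below is connected to the fact table with the same four-step pattern: add a surrogate key to the dimension table, add a matching column to orderproduct, backfill it with an UPDATE ... FROM join on the natural key, add a foreign key constraint, and finally drop the natural key from the fact table. A condensed sketch of that pattern, using the product dimension that is handled first below:

-- 1. surrogate key on the dimension table
ALTER TABLE product ADD COLUMN product_key SERIAL, ADD CONSTRAINT pk_product_key PRIMARY KEY (product_key);
-- 2. matching column on the fact table
ALTER TABLE orderproduct ADD COLUMN product_key INTEGER;
-- 3. backfill via the natural key
UPDATE orderproduct SET product_key = product.product_key
FROM product WHERE orderproduct.product_id = product.product_id;
-- 4. enforce the relationship and drop the natural key from the fact table
ALTER TABLE orderproduct ADD CONSTRAINT fk_product_key FOREIGN KEY (product_key) REFERENCES product(product_key);
ALTER TABLE orderproduct DROP COLUMN product_id;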

1. Link orderproduct and product table with product_key:¶

Create the product_key column

In [60]:
%%sql
ALTER TABLE orderproduct
ADD product_key INTEGER;
Running query in 'postgresql://postgres@/finalp'
Out[60]:

Create indexes to speed up the updates

In [4]:
%%sql
CREATE INDEX idx_orderproduct_product_id ON orderproduct(product_id);
CREATE INDEX idx_product_product_id ON product(product_id);
CREATE INDEX idx_orderproduct_product_key ON orderproduct(product_key);
CREATE INDEX idx_product_product_key ON product(product_key);
Running query in 'postgresql://postgres@/finalp'
Out[4]:

Set orderproduct.product_key

In [61]:
%%sql
UPDATE orderproduct
SET product_key = product.product_key
FROM product
WHERE orderproduct.product_id = product.product_id;
Running query in 'postgresql://postgres@/finalp'
33819106 rows affected.
Out[61]:

Check if product_key is created successfully

In [62]:
%%sql
SELECT * FROM orderproduct
LIMIT 10;
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[62]:
order_id product_id current_items_cart reordered product_key
1107 1 7 False 1
5319 1 3 True 1
9273 1 30 False 1
9696 1 5 True 1
11140 1 1 True 1
9228 1 2 False 1
7540 1 4 True 1
11485 1 4 False 1
12672 1 3 True 1
23202 1 3 False 1
Truncated to displaylimit of 10.

Check if any product_key is null

In [63]:
%%sql
SELECT COUNT(*)
FROM orderproduct
WHERE product_key IS NULL;
Running query in 'postgresql://postgres@/finalp'
1 rows affected.
Out[63]:
count
0

Add foreign key constraint

In [5]:
%%sql
ALTER TABLE orderproduct
ADD CONSTRAINT fk_product_key FOREIGN KEY (product_key) REFERENCES product(product_key);
Running query in 'postgresql://postgres@/finalp'
Out[5]:

Drop the unnecessary column 'product_id' from orderproduct table

In [6]:
%%sql
ALTER TABLE orderproduct
DROP COLUMN product_id;
Running query in 'postgresql://postgres@/finalp'
Out[6]:

2. Link orderproduct and aisle/department table with aisle_key/department_key:¶

Create the aisle_key and department_key columns

In [7]:
%%sql
ALTER TABLE orderproduct
ADD aisle_key INTEGER,
ADD department_key INTEGER;
Running query in 'postgresql://postgres@/finalp'
Out[7]:

Create indexes to speed up the updates

In [8]:
%%sql
CREATE INDEX idx_product_aisle_id ON product(aisle_id);
CREATE INDEX idx_product_department_id ON product(department_id);
CREATE INDEX idx_aisle_aisle_id ON aisle(aisle_id);
CREATE INDEX idx_department_department_id ON department(department_id);
CREATE INDEX idx_orderproduct_aisle_key ON orderproduct(aisle_key);
CREATE INDEX idx_aisle_aisle_key ON aisle(aisle_key);
CREATE INDEX idx_orderproduct_department_key ON orderproduct(department_key);
CREATE INDEX idx_department_department_key ON department(department_key);
Running query in 'postgresql://postgres@/finalp'
Out[8]:

Set orderproduct.aisle_key and orderproduct.department_key

In [13]:
%%sql

UPDATE orderproduct
SET aisle_key = aisle.aisle_key
FROM product
INNER JOIN aisle ON product.aisle_id = aisle.aisle_id
WHERE orderproduct.product_key = product.product_key;

UPDATE orderproduct
SET department_key = department.department_key
FROM product
INNER JOIN department ON product.department_id = department.department_id
WHERE orderproduct.product_key = product.product_key;
Running query in 'postgresql://postgres@/finalp'
33819106 rows affected.
33819106 rows affected.
Out[13]:

Check if the new columns were created successfully

In [14]:
%%sql
SELECT * FROM orderproduct
LIMIT 10;
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[14]:
order_id current_items_cart reordered product_key aisle_key department_key
2252923 14 False 36089 108 16
2250213 20 False 36089 108 16
2251148 2 True 36089 108 16
2252031 14 False 36089 108 16
2247300 8 True 36089 108 16
2248306 10 True 36089 108 16
2251443 10 False 36089 108 16
2247560 11 False 36089 108 16
2252685 7 False 36089 108 16
2251800 4 False 36089 108 16
Truncated to displaylimit of 10.

Check if any aisle_key or department_key is null

In [15]:
%%sql
SELECT COUNT(*)
FROM orderproduct
WHERE aisle_key IS NULL OR department_key IS NULL;
Running query in 'postgresql://postgres@/finalp'
1 rows affected.
Out[15]:
count
0

Add foreign key constraints

In [19]:
%%sql
ALTER TABLE orderproduct
ADD CONSTRAINT fk_aisle_key FOREIGN KEY (aisle_key) REFERENCES aisle(aisle_key),
ADD CONSTRAINT fk_department_key FOREIGN KEY (department_key) REFERENCES department(department_key);
Running query in 'postgresql://postgres@/finalp'
Out[19]:

Drop the unnecessary columns 'aisle_id' and 'department_id' from the product table

In [20]:
%%sql
ALTER TABLE product
DROP COLUMN aisle_id,
DROP COLUMN department_id;
Running query in 'postgresql://postgres@/finalp'
Out[20]:

We now have a new table structure like this:

In [42]:
Image(filename="/home/ubuntu/FinalAssignment/table_structure2.png")
Out[42]:

3. Link orderproduct and orders table with order_key:¶

Create the order_key column

In [22]:
%%sql
ALTER TABLE orderproduct
ADD order_key INTEGER;
Running query in 'postgresql://postgres@/finalp'
Out[22]:

Create indexes to speed up the updates

In [23]:
%%sql
CREATE INDEX idx_orderproduct_order_id ON orderproduct(order_id);
CREATE INDEX idx_orders_order_id ON orders(order_id);
CREATE INDEX idx_orderproduct_order_key ON orderproduct(order_key);
CREATE INDEX idx_orders_order_key ON orders(order_key);
Running query in 'postgresql://postgres@/finalp'
Out[23]:

Set orderproduct.order_key

In [11]:
%%sql
UPDATE orderproduct
SET order_key = orders.order_key
FROM orders
WHERE orderproduct.order_id = orders.order_id;
Running query in 'postgresql://postgres@/finalp'
33819106 rows affected.
Out[11]:

Check if order_key is created successfully

In [12]:
%%sql
SELECT * FROM orderproduct
LIMIT 10;
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[12]:
order_id current_items_cart reordered product_key aisle_key department_key order_key
2251148 2 True 36089 108 16 2965407
1795959 5 False 36089 108 16 2464896
1790775 2 False 36089 108 16 2443399
1826456 5 True 36089 108 16 1352909
1853565 3 False 36089 108 16 1443480
1812689 3 False 36089 108 16 1172396
1823307 10 False 36089 108 16 1020474
1622307 2 True 36089 108 16 3281094
1658804 14 True 36089 108 16 1669685
2061240 4 False 36089 108 16 760698
Truncated to displaylimit of 10.

Check if any order_key is null

In [13]:
%%sql
SELECT COUNT(*)
FROM orderproduct
WHERE order_key IS NULL;
Running query in 'postgresql://postgres@/finalp'
1 rows affected.
Out[13]:
count
0

Add foreign key constraint

In [14]:
%%sql
ALTER TABLE orderproduct
ADD CONSTRAINT fk_order_key FOREIGN KEY (order_key) REFERENCES orders(order_key);
Running query in 'postgresql://postgres@/finalp'
Out[14]:

Drop the unnecessary column 'order_id' from orderproduct table

In [15]:
%%sql
ALTER TABLE orderproduct
DROP COLUMN order_id;
Running query in 'postgresql://postgres@/finalp'
Out[15]:

We now have a new table structure like this:

In [43]:
Image(filename="/home/ubuntu/FinalAssignment/table_structure3.png")
Out[43]:

4. The orders table has both user and time information, so we extract time as another dimension:¶

Create the time dimensional table

In [17]:
%%sql
DROP TABLE IF EXISTS time;

CREATE TABLE time (
    time_key SERIAL PRIMARY KEY,
    order_dow INTEGER,
    hour_of_day INTEGER
);
Running query in 'postgresql://postgres@/finalp'
Out[17]:

Insert into time table

In [18]:
%%sql
INSERT INTO time (order_dow, hour_of_day)
SELECT DISTINCT order_dow, hour_of_day
FROM orders
ORDER BY order_dow, hour_of_day;
Running query in 'postgresql://postgres@/finalp'
168 rows affected.
Out[18]:

Check if it is successful

In [19]:
%%sql
SELECT * FROM time
LIMIT 10;
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[19]:
time_key order_dow hour_of_day
1 0 0
2 0 1
3 0 2
4 0 3
5 0 4
6 0 5
7 0 6
8 0 7
9 0 8
10 0 9
Truncated to displaylimit of 10.

Add time_key to the orderproduct table

In [20]:
%%sql
ALTER TABLE orderproduct
ADD COLUMN time_key INTEGER;
Running query in 'postgresql://postgres@/finalp'
Out[20]:

Create indexes to speed up the updates

In [21]:
%%sql
CREATE INDEX idx_orders_order_dow ON orders(order_dow);
CREATE INDEX idx_orders_hour_of_day ON orders(hour_of_day);
CREATE INDEX idx_time_order_dow ON time(order_dow);
CREATE INDEX idx_time_hour_of_day ON time(hour_of_day);
CREATE INDEX idx_orderproduct_time_key ON orderproduct(time_key);
CREATE INDEX idx_time_time_key ON time(time_key);
Running query in 'postgresql://postgres@/finalp'
Out[21]:
In [4]:
%%sql
UPDATE orderproduct
SET time_key = time.time_key
FROM orders
INNER JOIN time ON orders.order_dow = time.order_dow AND orders.hour_of_day = time.hour_of_day
WHERE orderproduct.order_key = orders.order_key;
Running query in 'postgresql://postgres@/finalp'
33819106 rows affected.
Out[4]:

Check if time_key is created successfully

In [5]:
%%sql
SELECT * FROM orderproduct
LIMIT 10;
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[5]:
current_items_cart reordered product_key aisle_key department_key order_key time_key
4 True 13419 109 11 317643 66
14 True 13426 105 13 17280 7
4 False 13427 45 19 1387488 43
3 False 13427 45 19 200462 81
7 False 13427 45 19 3395429 167
2 True 13434 105 13 1488274 112
4 True 13434 105 13 2374979 138
2 True 13434 105 13 2982761 159
10 True 13434 105 13 96169 106
27 True 13434 105 13 1103217 12
Truncated to displaylimit of 10.

Check if any time_key is null

In [6]:
%%sql
SELECT COUNT(*)
FROM orderproduct
WHERE time_key IS NULL;
Running query in 'postgresql://postgres@/finalp'
1 rows affected.
Out[6]:
count
0

Add foreign key constraint

In [7]:
%%sql
ALTER TABLE orderproduct
ADD CONSTRAINT fk_time_key FOREIGN KEY (time_key) REFERENCES time(time_key);
Running query in 'postgresql://postgres@/finalp'
Out[7]:

Drop the unnecessary columns 'order_dow' and 'hour_of_day' from orders table

In [8]:
%%sql
ALTER TABLE orders
DROP COLUMN order_dow,
DROP COLUMN hour_of_day;
Running query in 'postgresql://postgres@/finalp'
Out[8]:

We now have a new table structure like this:

In [44]:
Image(filename="/home/ubuntu/FinalAssignment/table_structure4.png")
Out[44]:

5. Extracting user_id as another dimension:¶

Create the users dimensional table

In [10]:
%%sql
DROP TABLE IF EXISTS users;

CREATE TABLE users (
    user_key SERIAL PRIMARY KEY,
    user_id INTEGER
);
Running query in 'postgresql://postgres@/finalp'
Out[10]:

Insert into users table

In [11]:
%%sql
INSERT INTO users (user_id)
SELECT DISTINCT user_id
FROM orders
ORDER BY user_id;
Running query in 'postgresql://postgres@/finalp'
206209 rows affected.
Out[11]:

Check if it is successful

In [12]:
%%sql
SELECT * FROM users
LIMIT 10;
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[12]:
user_key user_id
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
Truncated to displaylimit of 10.

Add user_key to the orderproduct table

In [13]:
%%sql
ALTER TABLE orderproduct
ADD COLUMN user_key INTEGER;
Running query in 'postgresql://postgres@/finalp'
Out[13]:

Create indexes to speed up the updates

In [14]:
%%sql
CREATE INDEX idx_orders_user_id ON orders(user_id);
CREATE INDEX idx_users_user ON users(user_id);
CREATE INDEX idx_orderproduct_user_key ON orderproduct(user_key);
CREATE INDEX idx_users_user_key ON users(user_key);
Running query in 'postgresql://postgres@/finalp'
Out[14]:
In [15]:
%%sql
UPDATE orderproduct
SET user_key = users.user_key
FROM orders
INNER JOIN users ON orders.user_id = users.user_id
WHERE orderproduct.order_key = orders.order_key;
Running query in 'postgresql://postgres@/finalp'
33819106 rows affected.
Out[15]:

Check if user_key is created successfully

In [16]:
%%sql
SELECT * FROM orderproduct
LIMIT 10;
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[16]:
current_items_cart reordered product_key aisle_key department_key order_key time_key user_key
7 False 28224 26 7 1554087 83 93344
3 True 28226 52 1 2749470 104 165704
7 False 28226 52 1 1531973 11 92013
1 True 28228 96 20 3250774 42 195937
16 True 28248 123 4 2594428 133 156190
16 True 23898 131 9 705041 11 42472
7 False 23898 131 9 3118632 40 188029
47 True 23909 84 16 1095289 14 65992
5 True 23909 84 16 1811471 40 108790
2 True 23909 84 16 478824 29 28858
Truncated to displaylimit of 10.

Check if any user_key is null

In [17]:
%%sql
SELECT COUNT(*)
FROM orderproduct
WHERE user_key IS NULL;
Running query in 'postgresql://postgres@/finalp'
1 rows affected.
Out[17]:
count
0

Add foreign key constraint

In [18]:
%%sql
ALTER TABLE orderproduct
ADD CONSTRAINT fk_user_key FOREIGN KEY (user_key) REFERENCES users(user_key);
Running query in 'postgresql://postgres@/finalp'
Out[18]:

Drop the unnecessary column 'user_id' from orders table

In [19]:
%%sql
ALTER TABLE orders
DROP COLUMN user_id;
Running query in 'postgresql://postgres@/finalp'
Out[19]:

We now have a new table structure like this:

In [45]:
Image(filename="/home/ubuntu/FinalAssignment/table_structure5.png")
Out[45]:

6. Check to make sure our new orderproduct table has the same number of rows as our original orderproduct.csv file:¶

In [21]:
%%sql
SELECT COUNT(*) FROM orderproduct;
Running query in 'postgresql://postgres@/finalp'
1 rows affected.
Out[21]:
count
33819106
In [22]:
!wc -l orderproduct.csv
33819107 orderproduct.csv

The rows match.

Final Step - We now have a star schema like this:¶

In [46]:
Image(filename="/home/ubuntu/FinalAssignment/star_schema.png")
Out[46]:

Ask 3: Data Analysis¶

Question 1: Which products and departments sell the most? How does this vary by time of day and day of week, if at all?¶

Enable the inline plotting

In [4]:
%matplotlib inline

The following query displays the top ten products by the number of times they were ordered.

In [5]:
%%sql
SELECT
    p.name, 
    COUNT(*) AS total_products_sold
FROM orderproduct op
INNER JOIN product p ON op.product_key = p.product_key
GROUP BY p.name
ORDER BY total_products_sold DESC
LIMIT 10;
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[5]:
name total_products_sold
Banana 491291
Bag of Organic Bananas 394930
Organic Strawberries 275577
Organic Baby Spinach 251705
Organic Hass Avocado 220877
Organic Avocado 184224
Large Lemon 160792
Strawberries 149445
Limes 146660
Organic Whole Milk 142813
Truncated to displaylimit of 10.

The departments sorted by product sales. We can see that a large share of sales comes from the produce and dairy eggs departments.

In [6]:
%%sql
SELECT 
    d.department,
    COUNT(*) AS total_products_sold
FROM orderproduct op
INNER JOIN department d ON op.department_key = d.department_key
GROUP BY d.department
ORDER BY total_products_sold DESC
LIMIT 10;
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[6]:
department total_products_sold
produce 9888378
dairy eggs 5631067
snacks 3006412
beverages 2804175
frozen 2336858
pantry 1956819
bakery 1225181
canned goods 1114857
deli 1095540
dry goods pasta 905340
Truncated to displaylimit of 10.

Bar plot of results

In [7]:
_.bar()
/home/ubuntu/.python3-venv/lib/python3.12/site-packages/sql/run/resultset.py:366: UserWarning: .bar() is deprecated and will be removed in a future version. Use %sqlplot bar instead. For more help, find us at https://ploomber.io/community 
  warnings.warn(
Out[7]:
<AxesSubplot: xlabel='department', ylabel='total_products_sold'>

The following displays the top ten departments and how their sales vary throughout the day.

In [13]:
import psycopg2
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Connect to the PostgreSQL database (only providing dbname and user)
conn = psycopg2.connect("dbname='finalp' user='postgres'")
c = conn.cursor()

# SQL query to count orders by department and hour of the day for the top 10 departments
query = """
    WITH department_order_counts AS (
        SELECT 
            d.department,
            COUNT(*) AS order_count
        FROM orderproduct op
        INNER JOIN department d ON op.department_key = d.department_key
        GROUP BY d.department
        ORDER BY order_count DESC
        LIMIT 10
    )
    SELECT 
        d.department,
        time.hour_of_day,
        COUNT(*) AS order_count
    FROM orderproduct op
    INNER JOIN department d ON op.department_key = d.department_key
    INNER JOIN time ON op.time_key = time.time_key
    WHERE d.department IN (SELECT department FROM department_order_counts)
    GROUP BY d.department, time.hour_of_day
    ORDER BY d.department, time.hour_of_day;
"""

# Execute the query
c.execute(query)

# Fetch the results
rows = c.fetchall()

c.close()
conn.close()

# Load the data into a Pandas DataFrame
df = pd.DataFrame(rows, columns=["Department", "Hour of Day", "Order Count"])

# Pivot the data for plotting (department on rows, hour of day on columns)
df_pivot = df.pivot(index="Hour of Day", columns="Department", values="Order Count").fillna(0)

# Plot the data as a line graph with markers (df_pivot.plot creates its own figure with the given figsize)
df_pivot.plot(kind='line', marker='o', figsize=(14, 7))

# Add title and labels
plt.title('Count of Product Sales by Top 10 Departments and Hour of Day', fontsize=16)
plt.xlabel('Hour of Day', fontsize=12)
plt.ylabel('Product Sales', fontsize=12)

# Rotate x-axis labels for better readability
plt.xticks(rotation=45)

# Display the legend on the right
plt.legend(title='Department', bbox_to_anchor=(1.05, 1), loc='upper left')

# Adjust layout for better spacing
plt.tight_layout()

# Show the plot
plt.show()

The following displays the top ten departments and how their sales vary throughout the week.

In [14]:
# Connect to the PostgreSQL database (only providing dbname and user)
conn = psycopg2.connect("dbname='finalp' user='postgres'")
c = conn.cursor()

# SQL query to count orders by department and day of the week (order_dow) for the top 10 departments
query = """
    WITH department_order_counts AS (
        SELECT 
            d.department,
            COUNT(*) AS order_count
        FROM orderproduct op
        INNER JOIN department d ON op.department_key = d.department_key
        GROUP BY d.department
        ORDER BY order_count DESC
        LIMIT 10
    )
    SELECT 
        d.department,
        time.order_dow,
        COUNT(*) AS order_count
    FROM orderproduct op
    INNER JOIN department d ON op.department_key = d.department_key
    INNER JOIN time ON op.time_key = time.time_key
    WHERE d.department IN (SELECT department FROM department_order_counts)
    GROUP BY d.department, time.order_dow
    ORDER BY d.department, time.order_dow;
"""

# Execute the query
c.execute(query)

# Fetch the results
rows = c.fetchall()

c.close()
conn.close()

# Load the data into a Pandas DataFrame
df = pd.DataFrame(rows, columns=["Department", "Day of Week", "Order Count"])

# Pivot the data for plotting (department on columns, day of week on rows)
df_pivot = df.pivot(index="Day of Week", columns="Department", values="Order Count").fillna(0)

# Plot the data as a line graph with markers (df_pivot.plot creates its own figure with the given figsize)
df_pivot.plot(kind='line', marker='o', figsize=(14, 7))

# Add title and labels
plt.title('Count of Product Sales by Top 10 Departments and Day of Week', fontsize=16)
plt.xlabel('Day of Week', fontsize=12)
plt.ylabel('Product Sales', fontsize=12)

# Replace x-axis tick labels with the actual names of the days
plt.xticks(ticks=range(7), labels=['Sat', 'Sun', 'Mon', 'Tue', 'Wed', 'Thu', 'Fri'], rotation=45)

# Display the legend on the right
plt.legend(title='Department', bbox_to_anchor=(1.05, 1), loc='upper left')

# Adjust layout for better spacing
plt.tight_layout()

# Show the plot
plt.show()

Question 2: Which products and departments have the highest re-order rates?¶

Top ten departments by reorder rate. We can see that dairy eggs has the highest rate.

In [15]:
%%sql
SELECT 
    d.department,
    ROUND(SUM(CASE WHEN op.reordered = TRUE THEN 1 ELSE 0 END) * 1.0 / COUNT(*), 2) AS reorder_rate
FROM orderproduct op
INNER JOIN department d ON op.department_key = d.department_key
GROUP BY d.department
ORDER BY reorder_rate DESC
LIMIT 10;
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[15]:
department reorder_rate
dairy eggs 0.67
produce 0.65
beverages 0.65
bakery 0.63
deli 0.61
pets 0.60
bulk 0.58
babies 0.58
alcohol 0.57
meat seafood 0.57
Truncated to displaylimit of 10.

Bar plot of results

In [17]:
# Connect to the PostgreSQL database (only providing dbname and user)
conn = psycopg2.connect("dbname='finalp' user='postgres'")
c = conn.cursor()

# SQL query to calculate the reorder rate by department
query = """
    SELECT 
        d.department,
        ROUND(SUM(CASE WHEN op.reordered = TRUE THEN 1 ELSE 0 END) * 1.0 / COUNT(*), 2) AS reorder_rate
    FROM orderproduct op
    INNER JOIN department d ON op.department_key = d.department_key
    GROUP BY d.department
    ORDER BY reorder_rate DESC
    LIMIT 10;
"""

# Execute the query
c.execute(query)

# Fetch the results
rows = c.fetchall()

# SQL query to calculate the average reorder rate for all products
avg_query = """
    SELECT 
        ROUND(SUM(CASE WHEN op.reordered = TRUE THEN 1 ELSE 0 END) * 1.0 / COUNT(*), 2) AS avg_reorder_rate
    FROM orderproduct op;
"""
# Execute the average reorder rate query
c.execute(avg_query)

# Fetch the result
avg_row = c.fetchone()
avg_reorder_rate = avg_row[0]

c.close()
conn.close()

# Load the data into a Pandas DataFrame
df = pd.DataFrame(rows, columns=["Department", "Reorder Rate"])

# Plot the data as a vertical bar graph
plt.figure(figsize=(10, 6))
plt.bar(df["Department"], df["Reorder Rate"], color='skyblue')

# Set y-axis range from 0.5 to 0.7
plt.ylim(0.55, 0.7)

# Add a reference line for the average reorder rate
plt.axhline(y=avg_reorder_rate, color='red', linestyle='--', label=f'Avg Reorder Rate: {avg_reorder_rate}')

# Add titles and labels
plt.title('Top 10 Departments by Reorder Rate', fontsize=16)
plt.xlabel('Department', fontsize=12)
plt.ylabel('Reorder Rate', fontsize=12)

# Rotate x-axis labels for better readability (make them horizontal)
plt.xticks(rotation=45, ha='right')

# Show the plot
plt.tight_layout()
plt.show()

Top ten products by re-order rate.

In [18]:
%%sql
SELECT 
    p.name AS product_name,
    d.department AS department_name,
    COUNT(*) AS order_count,
    ROUND(SUM(CASE WHEN op.reordered = TRUE THEN 1 ELSE 0 END) * 1.0 / COUNT(*), 2) AS reorder_rate
FROM orderproduct op
INNER JOIN product p ON op.product_key = p.product_key
INNER JOIN department d ON op.department_key = d.department_key
GROUP BY p.name, d.department
ORDER BY reorder_rate DESC
LIMIT 10;
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[18]:
product_name department_name order_count reorder_rate
Raw Veggie Wrappers deli 69 0.94
Serenity Ultimate Extrema Overnight Pads personal care 90 0.93
Orange Energy Shots beverages 13 0.92
Chocolate Love Bar snacks 102 0.92
Simply Sleep Nighttime Sleep Aid other 45 0.91
Soy Powder Infant Formula babies 35 0.91
Energy Shot, Grape Flavor beverages 22 0.91
Soy Crisps Lightly Salted snacks 67 0.90
Bars Peanut Butter pantry 69 0.90
Russian River Valley Reserve Pinot Noir alcohol 30 0.90
Truncated to displaylimit of 10.

However, these products do not have a lot of sales, so only products that were sold at least 10,000 times were analyzed.

In [22]:
%%sql
SELECT 
    p.name AS product_name,
    d.department AS department_name,
    COUNT(*) AS order_count,
    ROUND(SUM(CASE WHEN op.reordered = TRUE THEN 1 ELSE 0 END) * 1.0 / COUNT(*), 2) AS reorder_rate
FROM orderproduct op
INNER JOIN product p ON op.product_key = p.product_key
INNER JOIN department d ON op.department_key = d.department_key
GROUP BY p.name, d.department
HAVING COUNT(*) >= 10000
ORDER BY reorder_rate DESC, order_count DESC
LIMIT 10;
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[22]:
product_name department_name order_count reorder_rate
Banana produce 491291 0.85
Organic Reduced Fat Milk dairy eggs 36869 0.85
Milk, Organic, Vitamin D dairy eggs 20770 0.85
Organic Lowfat 1% Milk dairy eggs 15352 0.84
Organic Milk Reduced Fat, 2% Milkfat dairy eggs 13119 0.84
Organic Whole Milk dairy eggs 10102 0.84
Bag of Organic Bananas produce 394930 0.83
Organic Whole Milk dairy eggs 142813 0.83
Organic Fat Free Milk dairy eggs 27402 0.83
0% Greek Strained Yogurt dairy eggs 13651 0.83
Truncated to displaylimit of 10.

Using wildcards to investigate reorder rates for certain product types that seem to be popular. All are above the overall average of 59%.
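
The 59% figure is the overall reorder rate across all rows of orderproduct; a quick sketch of the query behind it (equivalent to the avg_query used in the plotting cell above):

SELECT ROUND(SUM(CASE WHEN reordered = TRUE THEN 1 ELSE 0 END) * 1.0 / COUNT(*), 2) AS overall_reorder_rate
FROM orderproduct;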

In [20]:
%%sql
SELECT 
    CASE 
        WHEN p.name ILIKE '%Milk%' THEN 'Milk'
        WHEN p.name ILIKE '%Yogurt%' THEN 'Yogurt'
        WHEN p.name ILIKE '%Banana%' THEN 'Banana'
        WHEN p.name ILIKE '%Avocado%' THEN 'Avocado'
        WHEN p.name ILIKE '%Eggs%' THEN 'Eggs'
    END AS product_type,
    COUNT(DISTINCT op.product_key) AS product_count,  -- Count the number of distinct products in each category
    ROUND(SUM(CASE WHEN op.reordered = TRUE THEN 1 ELSE 0 END) * 1.0 / COUNT(*), 2) AS reorder_rate,
    ROUND(AVG(o.days_since_last_order), 2) AS avg_days_since_last_order  -- Calculate the average days since last order
FROM orderproduct op
INNER JOIN product p ON op.product_key = p.product_key
INNER JOIN orders o ON op.order_key = o.order_key  -- Join with orders to get days_since_last_order
WHERE 
    p.name ILIKE '%Milk%' OR
    p.name ILIKE '%Yogurt%' OR
    p.name ILIKE '%Banana%' OR
    p.name ILIKE '%Avocado%' OR
    p.name ILIKE '%Eggs%'
GROUP BY product_type
ORDER BY reorder_rate DESC;
Running query in 'postgresql://postgres@/finalp'
5 rows affected.
Out[20]:
product_type product_count reorder_rate avg_days_since_last_order
Banana 302 0.81 10.72
Avocado 88 0.74 10.99
Eggs 124 0.71 11.75
Milk 1310 0.71 10.89
Yogurt 1010 0.68 11.13

Analyzing which products are re-ordered together the most, to make stronger recommendations to users.

In [25]:
%%sql
WITH reordered_pairs AS (
    SELECT
        op1.product_key AS product_key_1,
        op2.product_key AS product_key_2
    FROM orderproduct op1
    INNER JOIN orderproduct op2 ON op1.order_key = op2.order_key
    WHERE op1.product_key < op2.product_key  -- Ensure no duplicate pairs (avoid reversing pairs)
        AND op1.reordered = TRUE
        AND op2.reordered = TRUE
)
SELECT
    p1.name AS product_name_1,
    p2.name AS product_name_2,
    COUNT(*) AS reorder_count
FROM reordered_pairs rp
INNER JOIN product p1 ON rp.product_key_1 = p1.product_key
INNER JOIN product p2 ON rp.product_key_2 = p2.product_key
GROUP BY p1.name, p2.name
ORDER BY reorder_count DESC
LIMIT 10;
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[25]:
product_name_1 product_name_2 reorder_count
Bag of Organic Bananas Organic Hass Avocado 51954
Bag of Organic Bananas Organic Strawberries 49675
Organic Strawberries Banana 43062
Banana Organic Avocado 41557
Organic Baby Spinach Banana 39927
Bag of Organic Bananas Organic Baby Spinach 39446
Bag of Organic Bananas Organic Raspberries 32811
Organic Strawberries Organic Hass Avocado 31657
Banana Large Lemon 29995
Strawberries Banana 29982
Truncated to displaylimit of 10.

Question 3: Which products and departments are bought most frequently (shortest average time between orders)?¶

List of top ten products by fastest average re-order time (only including products that sold at least 10k units).

In [27]:
%%sql
SELECT 
    p.product_id,
    p.name AS product_name,
    COUNT(*) AS product_sales,
    ROUND(AVG(o.days_since_last_order), 2) AS avg_days_since_last_order
FROM orderproduct op
INNER JOIN orders o ON op.order_key = o.order_key
INNER JOIN product p ON op.product_key = p.product_key
GROUP BY p.product_id, p.name
HAVING COUNT(*) >= 10000
ORDER BY avg_days_since_last_order
LIMIT 10;
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[27]:
product_id product_name product_sales avg_days_since_last_order
31720 Organic Whole Milk 10102 8.92
29447 Milk, Organic, Vitamin D 20770 9.47
329 Organic Whole Grassmilk Milk 15939 9.52
31915 Organic Red Delicious Apple 14334 9.58
43122 Organic Bartlett Pear 38686 9.62
13166 Organic Milk Reduced Fat, 2% Milkfat 13119 9.64
27744 Organic Whipping Cream 11002 9.67
8174 Organic Navel Orange 44701 9.68
1244 Organic Orange Bell Pepper 11005 9.71
1463 Organic Milk 28850 9.73
Truncated to displaylimit of 10.

Top ten departments by fastest re-order time

In [29]:
%%sql
SELECT
    d.department_id,
    d.department AS department_name,
    COUNT(*) AS product_sales,
    ROUND(AVG(o.days_since_last_order), 2) AS avg_days_since_last_order
FROM orderproduct op
INNER JOIN orders o ON op.order_key = o.order_key
INNER JOIN department d ON op.department_key = d.department_key
GROUP BY d.department_id, d.department
ORDER BY avg_days_since_last_order
LIMIT 10;
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[29]:
department_id department_name product_sales avg_days_since_last_order
18 babies 438743 10.21
10 bulk 35932 10.34
5 alcohol 159294 10.58
21 missing 77396 10.75
4 produce 9888378 11.11
19 snacks 3006412 11.20
16 dairy eggs 5631067 11.23
2 other 38086 11.24
7 beverages 2804175 11.31
3 bakery 1225181 11.35
Truncated to displaylimit of 10.
In [32]:
# Connect to the PostgreSQL database
conn = psycopg2.connect("dbname='finalp' user='postgres'")
c = conn.cursor()

# SQL query to get the top 10 products with avg_days_since_last_order
query = """
    SELECT 
        p.product_id,
        p.name AS product_name,
        COUNT(*) AS product_sales,
        ROUND(AVG(o.days_since_last_order), 2) AS avg_days_since_last_order
    FROM orderproduct op
    INNER JOIN orders o ON op.order_key = o.order_key
    INNER JOIN product p ON op.product_key = p.product_key
    GROUP BY p.product_id, p.name
    HAVING COUNT(*) >= 10000
    ORDER BY avg_days_since_last_order
    LIMIT 10;
"""

# Execute the query
c.execute(query)

# Fetch the results
rows = c.fetchall()

c.close()
conn.close()

# Load the data into a Pandas DataFrame
df = pd.DataFrame(rows, columns=["Product ID", "Product Name", "Product Sales", "Avg Days Since Last Order"])

# Plot the data as a bar chart
plt.figure(figsize=(10, 6))
plt.barh(df["Product Name"], df["Avg Days Since Last Order"], color='skyblue')

# Add title and labels
plt.title('Top 10 Products by Average Days Since Last Order', fontsize=16)
plt.xlabel('Average Days Since Last Order', fontsize=12)
plt.ylabel('Product Name', fontsize=12)

# Set x-axis range between 8 and 10
plt.xlim(8, 10)
# Invert the y-axis to display the lowest values at the top
plt.gca().invert_yaxis()

# Adjust layout for better spacing
plt.tight_layout()

# Show the plot
plt.show()
In [35]:
# Connect to the PostgreSQL database
conn = psycopg2.connect("dbname='finalp' user='postgres'")
c = conn.cursor()

# SQL query to get the top 10 departments with avg_days_since_last_order
query = """
     SELECT
        d.department_id,
        d.department AS department_name,
        COUNT(*) AS product_sale,
        ROUND(AVG(o.days_since_last_order), 2) AS avg_days_since_last_order
    FROM orderproduct op
    INNER JOIN orders o ON op.order_key = o.order_key
    INNER JOIN department d ON op.department_key = d.department_key
    GROUP BY d.department_id, d.department
    ORDER BY avg_days_since_last_order
    LIMIT 10;
"""

# Execute the query
c.execute(query)

# Fetch the results
rows = c.fetchall()

# Load the data into a Pandas DataFrame
df = pd.DataFrame(rows, columns=["Department ID", "Department Name", "Product Sales", "Avg Days Since Last Order"])

# SQL query to calculate the overall average days since last order for all products
avg_query = """
    SELECT 
        ROUND(AVG(o.days_since_last_order), 2) AS avg_days_since_last_order
    FROM orders o;
"""

# Execute the average query
c.execute(avg_query)

# Fetch the result
avg_row = c.fetchone()
overall_avg_days_since_last_order = avg_row[0]

c.close()
conn.close()

# Plot the data as a horizontal bar chart
plt.figure(figsize=(10, 6))
plt.barh(df["Department Name"], df["Avg Days Since Last Order"], color='skyblue')

# Add the reference line for the overall average
plt.axvline(x=overall_avg_days_since_last_order, color='red', linestyle='--', label=f'Avg Days Across All Products: {overall_avg_days_since_last_order}')

# Add titles and labels
plt.title('Top 10 Departments by Average Days Since Last Order', fontsize=16)
plt.xlabel('Average Days Since Last Order', fontsize=12)
plt.ylabel('Department Name', fontsize=12)

# Set x-axis range (adjust based on your data)
plt.xlim(10, 12)

# Invert the y-axis to display the lowest values at the top
plt.gca().invert_yaxis()

# Add a legend to label the reference line
plt.legend()

# Adjust layout for better spacing
plt.tight_layout()

# Show the plot
plt.show()

Since the 'missing' department appears among the departments with the fastest re-order times, we check which products fall under it.

In [40]:
%%sql
SELECT 
    p.name AS product_name,
    COUNT(*) AS product_sales
FROM orderproduct op
INNER JOIN product p ON op.product_key = p.product_key
INNER JOIN department d ON op.department_key = d.department_key
WHERE d.department = 'missing'
GROUP BY p.name
ORDER BY product_sales DESC
LIMIT 10;
Running query in 'postgresql://postgres@/finalp'
10 rows affected.
Out[40]:
product_name product_sales
Organic Riced Cauliflower 9189
Peanut Butter Ice Cream Cup 2194
Organic Celery Bunch 1845
Organic Mango Yogurt 1804
Natural Lemon Flavored Sparkling Water 1618
Cookie Dough Ice Cream Chocolate Chip 1506
Organic Whole Milk Washington Black Cherry Yogurt 1506
Soy & Dairy Free Plain Unsweetened Almond Milk Yogurt 1105
Uncured Beef Hot Dog 1072
Organic Nondairy Strawberry Cashew Yogurt 987
Truncated to displaylimit of 10.