Using SQL to Analyze Iowa Liquor Sales
Exploring Public Datasets in Google BigQuery
- About the Data
- Google BigQuery (SQL Queries)
- Count Table Rows
- Total Sales by Years
- Sales by Date (Top 5)
- (2012-2020) Sales by Weekday
- (2012-2020) Sales by Month
- (2012-2020) Sales by Day for December (Top 5)
- (2012-2020) Sales by Day for June (Top 5)
- (2012-2020) Sales by City
- (2012-2020) Sales by County
- (2012-2020) Sales by Vendor (Top 5)
- (2012-2020) Sales by Item (Top 5)
- (2012-2020) Sales by Item and Category (Top 5)
- Sales by Item and Category for the Year 2020
- Sales by Item and Category for the Past 10 Years (Top 10)
- References
Notebook Created by: David Rusho (Github Blog | Tableau | Linkedin)
About the Data
Data Source: data.iowa.gov
Provided by: Iowa Department of Commerce, Alcoholic Beverages Division
Data Created: November 7, 2014
Last Updated: July 1, 2021
This dataset contains the spirits purchase information of Iowa Class āEā liquor licensees by product and date of purchase from January 1, 2014 to current. The dataset can be used to analyze total spirits sales in Iowa of individual products at the store level.
Class E liquor license, for grocery stores, liquor stores, convenience stores, etc., allows commercial establishments to sell liquor for off-premises consumption in original unopened containers.
# query: count table rows and shows years
row_count = pd.io.gbq.read_gbq(
f"""
SELECT
COUNT(*) as total_rows
FROM {dataloc}
LIMIT 1""",
project_id=pjt_id,
)
row_count
yrs = pd.io.gbq.read_gbq(
f"""SELECT
DISTINCT(FORMAT_DATE('%Y', date)) AS years,
ROUND(SUM(sale_dollars),2) AS total_sales
FROM {dataloc}
GROUP BY
years
ORDER BY
years DESC
LIMIT 9""",
project_id=pjt_id,)
yrs.sort_values(by='years',inplace=True)
yrs["total_sales"] = round(yrs["total_sales"]/1000000,2)
fig = px.line(yrs, x="years", y="total_sales",
text="total_sales",
line_shape='spline')
fig.update_traces(textposition="bottom right")
fig.update_layout(
title={
"text": f"Total Sales (in millions) by Year",
"y": 0.98,
"x": 0.5,
"xanchor": "center",
"yanchor": "top",
},
yaxis_title="")
fig.show()
# query: top 5 sales by date
sum_sales_date = pd.io.gbq.read_gbq(
f"""
SELECT
date,
FORMAT_DATE('%A', date) AS day_name,
ROUND(SUM(sale_dollars),2) AS total_sales,
FROM
{dataloc}
GROUP BY
date
ORDER BY
total_sales DESC
""",
project_id=pjt_id,)
sum_sales_date.head(5)
# query: top 5 sales by weekday name
sum_sales_wkday = pd.io.gbq.read_gbq(
f"""
SELECT
FORMAT_DATE('%A', date) AS day,
ROUND(SUM(sale_dollars),2) AS total_sales,
FROM
{dataloc}
GROUP BY
day
ORDER BY
total_sales DESC
""",
project_id=pjt_id,
)
# create bar chart
fig = px.bar(sum_sales_wkday, y="total_sales", x="day", text="total_sales")
# update bar markers
fig.update_traces(textposition="outside", marker_color="rgb(47,138,196)") # blue color
# update plot details
fig.update_layout(
{"plot_bgcolor": "rgba(255,255,255, 0.9)"}, # white background
title={
"text": f"Total Liquor Sales (2012-2020) by Weekday",
"y": 0.98,
"x": 0.5,
"xanchor": "center",
"yanchor": "top",
},
yaxis_title="",
xaxis={
"categoryarray": [
"Monday",
"Tuesday",
"Wednesday",
"Thursday",
"Friday",
"Saturday",
"Sunday",
]
}, # remove axis titles
)
fig.show()
# query: top 5 sales by month name
sum_sales_mth = pd.io.gbq.read_gbq(
f"""
SELECT
FORMAT_DATE('%B', date) AS month,
ROUND(SUM(sale_dollars),2) AS total_sales,
FROM
{dataloc}
GROUP BY
month
ORDER BY
total_sales DESC
""",
project_id=pjt_id,
)
# create bar chart
fig = px.bar(sum_sales_mth, x="total_sales", y="month", text="total_sales")
months = [
"December",
"November",
"October",
"September",
"August",
"July",
"June",
"May",
"April",
"March",
"February",
"January",
]
# update bar markers
fig.update_traces(textposition="inside", marker_color="rgb(47,138,196)") # blue color
# update plot details
fig.update_layout(
{"plot_bgcolor": "rgba(255,255,255, 0.9)"}, # white background
title={
"text": f"Total Liquor Sales (2012-2020) by Month",
"y": 0.98,
"x": 0.5,
"xanchor": "center",
"yanchor": "top",
},
yaxis_title="",
yaxis={"categoryarray": months}, # remove axis titles
)
fig.show()
# query: top 5 sales by month (December)
dec_sum_sales = pd.io.gbq.read_gbq(
f"""
SELECT
EXTRACT(DAY FROM date) AS day,
ROUND(SUM(sale_dollars),2) AS total_sales,
FROM
{dataloc}
WHERE
FORMAT_DATE('%B', date) = 'December'
GROUP BY
day
ORDER BY
total_sales DESC
""",
project_id=pjt_id,)
dec_sum_sales.head(5)
# query: top 5 sales by month (June)
june_sum_sales = pd.io.gbq.read_gbq(
f"""
SELECT
EXTRACT(DAY FROM date) AS day,
ROUND(SUM(sale_dollars),2) AS total_sales,
FROM
{dataloc}
WHERE
FORMAT_DATE('%B', date) = 'June'
GROUP BY
day
ORDER BY
total_sales DESC
""",
project_id=pjt_id,)
june_sum_sales.head(5)
# query: top 5 sales by city
city_sum_sales = pd.io.gbq.read_gbq(
f"""
SELECT
LOWER(city) AS city,
ROUND(CAST(SUM(sale_dollars) AS numeric),0) AS total_sales,
FROM
{dataloc}
GROUP BY
city
ORDER BY
total_sales DESC
""",
project_id=pjt_id,)
# title case for city col
city_sum_sales.city = city_sum_sales.city.str.title()
#convert total_sales to float
city_sum_sales.total_sales = city_sum_sales.total_sales.astype('float')
# city_sum_sales.head()
# create bar chart
fig = px.bar(city_sum_sales.head(10), y="total_sales", x="city",text="total_sales")
# update bar markers
fig.update_traces(textposition="outside",
marker_color="rgb(81,162,213)") # blue color
# update plot details
fig.update_layout(
{"plot_bgcolor": "rgba(255,255,255, 0.9)"}, # white background
title={
"text": f"Total Liquor Sales (2012-2020) by City",
"y": 0.98,
"x": 0.5,
"xanchor": "center",
"yanchor": "top",
},
yaxis_title="",
)
fig.show()
# query: top 5 sales by county
county_sum_sales = pd.io.gbq.read_gbq(
f"""
SELECT
LOWER(county) AS county,
ROUND(CAST(SUM(sale_dollars) AS numeric),0) AS total_sales,
FROM
{dataloc}
GROUP BY
county
ORDER BY
total_sales DESC
""",
project_id=pjt_id,)
# title case for county col
county_sum_sales.county = county_sum_sales.county.str.title()
#convert total_sales to float
county_sum_sales.total_sales = county_sum_sales.total_sales.astype('float')
# county_sum_sales.head()
# create bar chart
fig = px.bar(county_sum_sales.head(10), y="total_sales", x="county",text="total_sales")
# update bar markers
fig.update_traces(textposition="outside",
marker_color="rgb(81,162,213)") # blue color
# update plot details
fig.update_layout(
{"plot_bgcolor": "rgba(255,255,255, 0.9)"}, # white background
title={
"text": f"Total Liquor Sales (2012-2020) by County",
"y": 0.98,
"x": 0.5,
"xanchor": "center",
"yanchor": "top",
},
yaxis_title="",
)
fig.show()
# query: top 5 sales by vendor
vendor_sum_sales = pd.io.gbq.read_gbq(
f"""
SELECT
LOWER(vendor_name) AS vendor,
ROUND(CAST(SUM(sale_dollars) AS numeric),0) AS total_sales
FROM
{dataloc}
GROUP BY
vendor
ORDER BY
total_sales DESC
""",
project_id=pjt_id,)
# title case for vendor col
vendor_sum_sales.vendor = vendor_sum_sales.vendor.str.title()
#convert total_sales to float
vendor_sum_sales.total_sales = vendor_sum_sales.total_sales.astype('float')
vendor_sum_sales.head()
# query: top 5 sales by item
item_sum_sales = pd.io.gbq.read_gbq(
f"""
SELECT
LOWER(item_description) AS item,
ROUND(CAST(SUM(sale_dollars) AS numeric),0) AS total_sales
FROM
{dataloc}
GROUP BY
item
ORDER BY
total_sales DESC
""",
project_id=pjt_id,)
# title case for item col
item_sum_sales.item = item_sum_sales.item.str.title()
#convert total_sales to float
item_sum_sales.total_sales = item_sum_sales.total_sales.astype('float')
item_sum_sales.head()
# query: top 5 sales by item & category
item_cat_sum_sales = pd.io.gbq.read_gbq(
f"""
SELECT
LOWER(item_description) AS item,
LOWER(category_name) AS category,
ROUND(CAST(SUM(sale_dollars) AS numeric),0) AS total_sales
FROM
{dataloc}
GROUP BY
item,category
ORDER BY
total_sales DESC
""",
project_id=pjt_id,)
# title case for item col
item_cat_sum_sales.item = item_cat_sum_sales.item.str.title()
# title case for category col
item_cat_sum_sales.category = item_cat_sum_sales.category.str.title()
#convert total_sales to float
item_cat_sum_sales.total_sales = item_cat_sum_sales.total_sales.astype('float')
item_cat_sum_sales.head()
# query: top 5 sales by item & category for 2020
item_cat_sum_sales_2020 = pd.io.gbq.read_gbq(
f"""
SELECT
FORMAT_DATE('%Y', date) AS year,
LOWER(item_description) AS item,
LOWER(category_name) AS category,
ROUND(CAST(SUM(sale_dollars) AS numeric),0) AS total_sales
FROM
{dataloc}
WHERE
FORMAT_DATE('%Y', date) LIKE ('%2020%')
GROUP BY
item,
category,
year
ORDER BY
total_sales DESC
""",
project_id=pjt_id,)
# title case for item col
item_cat_sum_sales_2020.item = item_cat_sum_sales_2020.item.str.title()
# title case for category col
item_cat_sum_sales_2020.category = item_cat_sum_sales_2020.category.str.title()
#convert total_sales to float
item_cat_sum_sales_2020.total_sales = item_cat_sum_sales_2020.total_sales.astype('float')
item_2020 = (
item_cat_sum_sales_2020.sort_values(by="total_sales", ascending=False)
.head(10)
.copy()
)
item_2020 = item_2020.sort_values(by="total_sales", ascending=True)
fig = px.bar(
item_2020,
x="total_sales",
y="item",
orientation="h",
text="total_sales",
color="category",
)
# update bar markers
fig.update_traces(textposition="inside")
# update plot details
fig.update_layout(
{"plot_bgcolor": "rgba(255,255,255, 0.9)"}, # white background
title={
"text": f"2020 Sales by Item & Category ",
"y": 0.98,
"x": 0.5,
"xanchor": "center",
"yanchor": "top",
},
yaxis_title="" #remove axis titles
)
fig.show()
# query: top 5 sales by item & category for past 10 years
item_cat_sum_sales = pd.io.gbq.read_gbq(
f"""
SELECT
FORMAT_DATE('%Y', date) AS year,
LOWER(item_description) AS item,
LOWER(category_name) AS category,
ROUND(CAST(SUM(sale_dollars) AS numeric),0) AS total_sales
FROM
{dataloc}
GROUP BY
item,
category,
year
ORDER BY
total_sales DESC
""",
project_id=pjt_id,)
# title case for item col
item_cat_sum_sales.item = item_cat_sum_sales.item.str.title()
# title case for category col
item_cat_sum_sales.category = item_cat_sum_sales.category.str.title()
#convert total_sales to float
item_cat_sum_sales.total_sales = item_cat_sum_sales.total_sales.astype('float')
item_cat_sum_sales.head(10)
References
Google Cloud Docs: SQL Date Formating