Positive COVID-19 Reports in Florida Schools for 2021
Data Wrangling and Data Cleaning for export to Tableau
Notebook Created by: David Rusho (Github Blog | Tableau | Linkedin)
Tableau Dashboard Preview: FL School's Covid-19 Cases
About the Data
This Data comes from the Florida Department of Health. They release several reports throughout the years that list the number of positive covid-19 cases by school. The types of schools include Elementary, Middle, High School, Charter/Prepatory Schools, and University/Colleges. The data is presented on a pdf, is in wide format, and the school types are note labeled.
Also, the version of Tableau that is being used is Tableau Public. This means that the data that is exported will need to be in an excel format.
import pandas as pd
#import pdf and filter cols/rows
df = pd.read_csv('schools_latest.csv',usecols=[0,1,12,13,14,15,16,17,18,19],skiprows=5)
df.sample(2)
#rename columns
df.columns=['School','County','Total_Cases','Students','Teachers','Staff','Unknown','Symptoms_Yes','Symptoms_No','Symptoms_Unknown']
df.sample(2)
# new df with nan values removed
df2 = df1[df1['County'].isna()==False]
df2.sample(3)
df2.isna().sum()
df3 = df2[df2.Total_Cases.isna()==False]
df3.sample(3)
#df is balanced with nan values removed
df3.info()
# No way to identify which goups symptoms belong to
df4 = df3.drop(columns=['Symptoms_Yes','Symptoms_No','Symptoms_Unknown']).reset_index(drop=True)
df4.head(3)
df4['School_County'] = (df4.School.str.cat(df4.County, sep=" (") + " County)").str.title()
df4.head()
df_long = df4.melt(id_vars=['School','County','School_County','Total_Cases'],
var_name='Individual',
value_name='pos_covid')
df_long.sample(3)
# Drop '_Total Cases_' col
df_long2 = df_long.drop(columns='Total_Cases')
df_long2.head(2)
#Title Case for Schools
df_long2.School = df_long2.School.str.title()
df_long2.head(2)
#Label University/Colleges
df_long2.loc[((df_long2.School.str.contains('University')==True) | (df_long2.School.str.contains('College')==True)) &
(df_long2.School.str.contains('Middle')==False) &
(df_long2.School.str.contains('Elementary')==False) &
(df_long2.School.str.contains('High')==False) &
(df_long2.School.str.contains('Academy')==False) &
(df_long2.School.str.contains('Charter')==False) &
(df_long2.School.str.contains('Prep')==False) &
(df_long2.School.str.contains('Preparatory')==False) &
(df_long2.School.str.contains('1st College')==False) &
(df_long2.School.str.contains('Kids')==False) &
(df_long2.School.str.contains('Little College')==False), 'School_Type'] = 'University/College'
df_long2[df_long2['School'].str.contains('Uni')].sample(3)
#Label High Schools
df_long2.loc[((df_long2.School.str.contains('High')==True) | (df_long2.School.str.contains('Hs')==True)) &
(df_long2.School.str.contains('Middle')==False) &
(df_long2.School.str.contains('Elementary')==False) &
(df_long2.School.str.contains('Adult')==False),'School_Type'] = 'High School'
df_long2[df_long2['School'].str.contains('High')].sample(3)
#Identify Middle Schools
df_long2.loc[(df_long2.School.str.contains('High')==False) &
(df_long2.School.str.contains('Middle')==True) &
(df_long2.School.str.contains('Elementary')==False) &
(df_long2.School.str.contains('Adult')==False),'School_Type'] = 'Middle School'
df_long2[df_long2['School'].str.contains('Middle')].sample(3)
#identify Elementary Schools
#some elementary schools have 'University' or 'College' in their names
df_long2.loc[(df_long2.School.str.contains('High')==False) &
(df_long2.School.str.contains('Middle')==False) &
(df_long2.School.str.contains('Elementary')==True) &
(df_long2.School.str.contains('Adult')==False),'School_Type'] = "Elementary School"
df_long2[df_long2['School'].str.contains('Elem')].sample(3)
df_long2.loc[df_long2.School_Type.isna(),'School_Type'] = 'No Label'
df_long2[df_long2['School'].str.contains('Hs')]
#Change dtype of pos_covid to 'int'
df_long2.pos_covid = df_long2.pos_covid.str.replace(',','').astype('int')
df_long2.info()
#grouby school type
df_long2.groupby('School_Type').count()
df_long2.groupby(['School_Type','Individual'])['pos_covid'].sum().to_frame().reset_index().
# groupby county and postive covid cases
df_long2.groupby('County')['pos_covid'].sum().to_frame().reset_index().sort_values(by='pos_covid',ascending=False).head(15)
#collapse
# groupby one county and individuals
var = 'Orange'
sel_county = df_long2[df_long2['County']==var]
sel_county.groupby(['County','Individual'])['pos_covid'].sum().to_frame().reset_index()
df_long2 = df_long2.sort_values(by=['County','School']).reset_index(drop=True)
df_long2.to_excel('Florida_Schools_COVID19_Cases_2020_2021.xls')