Exploratory Data Analysis: Cardiovascular Disease and Related Health Variables — Python Code

13 min readOct 25, 2023

This analysis uses S3, Pandas, NumPy, Seaborn, Matplotlib, and Plotly

Access codes from the S3 bucket have been removed

The code block below imports libraries and loads files from S3 into a Python dictionary. Most of the original CDC data files are stored as SAS transport files(.XPT). These files are read into Python data frames.

import sys
import boto3
import tempfile
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import warnings
#Python 3.10.7 produces warnings with sns categorical variable; therefore, ignore warnings. Python 3.9 does not produce warnings
warnings.filterwarnings("ignore")

def update_progress_bar(i, num_files):
    """Input: i integer (num files iterated through), num_files (total number of files in S3 bucket)
    Function: Compute bar # = completed progress, - = incomplete progress, int rounding of progress
    Output: Temporary print progress bar"""

    progress = i/num_files
    bar_length = 50
    filled_length = int(round(bar_length * progress))

    bar = f"[{'#' * filled_length}{'-' * (bar_length - filled_length)}] {int(progress * 100)}%"
    sys.stdout.write(f"\r{bar}")
    sys.stdout.flush()

#S3 bucket can be accessed publicly using the following keys and specificed bucket name
access_key = ''
secret_key = ''
bucket_name = '593milestone1' #Prefix not necessary as all files are in root path

#Initialize the S3 client with explicit credentials
s3 = boto3.client('s3', aws_access_key_id=access_key, aws_secret_access_key=secret_key)

#List objects in the root of the S3 bucket
response = s3.list_objects_v2(Bucket=bucket_name)

#Initialize an empty list to store dataframes, num_files in S3 bucket and i to use later for progress bar
dataframe_dict = {}
num_files = len(response.get('Contents', []))
i=1

#Iterate through the objects in the bucket
for obj in response.get('Contents', []):
    #Get the object key (file path)
    file_key = obj['Key']
    name = file_key[:-4]
    
    #Create a temporary file to store the data
    with tempfile.NamedTemporaryFile() as temp_file:
        #Download the object (file) from S3 to the temporary file
        s3.download_file(Bucket=bucket_name, Key=file_key, Filename=temp_file.name)
        
        #Specify options for loading dataframes based on file extension
        #We only have XPT and CSV to worry about
        try:
            #Read the XPT data from the temporary file into a pandas dataframe
            df = pd.read_sas(temp_file.name, format='xport', encoding='utf-8')
        except:
            #Read the CSV data from the temporary file into a pandas dataframe
            df = pd.read_csv(temp_file.name)
    
    #Append the dataframe to dictionary, with name specified as its key
    dataframe_dict[name] = df

    #Show updated progress bar, add 1 to i
    update_progress_bar(i, num_files)
    i+=1

The following code performs extensive data manipulation to create visualizations for the Cardiovascular Disease Analysis presentation.

The CDC’s Behavioral Risk Factor Surveillance System (BRFSS) dataset is modified to allow for better combination with the CDC’s National Health and Nutrition Examination Survey (NHANES) data set.

df22 = dataframe_dict['BRFSS_2022'].copy()
BRFSS_df = df22.copy()

demo_cols = ['Gender', 'Age Group', 'Income Group', 'Race']

#reassign age values to categorical values
BRFSS_df['Age Group'] = BRFSS_df['_AGE_G'].replace({1:'18 to 24', 2:'25 to 34', 3:'35 to 44', 4:'45 to 54', 5:'55 to 64', 6:'65 or older'})
# Define the desired order of the category column
age_category_order = ['18 to 24', '25 to 34','35 to 44','45 to 54','55 to 64','65 or older']
BRFSS_df['Age Group'] = pd.Categorical(BRFSS_df['Age Group'], categories=age_category_order, ordered=True)

#reassigned race
race_replace_dict = {1:'White', 2:'Black', 3:'Other Race or Multi-Racial', 4:'Asian', 5:'Other Race or Multi-Racial', 6:'Other Race or Multi-Racial', 7:'Hispanic'}
BRFSS_df['Race'] = BRFSS_df['_RACEPR1'].replace(race_replace_dict)
#order the category
race_category_order = ['White', 'Black', 'Asian', 'Hispanic', 'Other Race or Multi-Racial']
BRFSS_df['Race'] = pd.Categorical(BRFSS_df['Race'], categories=race_category_order, ordered=True)

#replace the gender category
BRFSS_df['Gender'] = BRFSS_df['SEXVAR'].replace({1:'Male', 2:'Female'})
#order the category
gender_category_order = ['Male', 'Female']
BRFSS_df['Gender'] = pd.Categorical(BRFSS_df['Gender'], categories=gender_category_order, ordered=True)

#replace the income category -- first make a dict for replacing
list_values = ['Less than $10,000','$10,000 < $15,000','$15,000 < $20,000', '$20,000 < $25,000', 
               '$25,000 < $35,000', '$35,000 < $50,000','$50,000 < $75,000','$75,000 < $100,000' ,
               '$100,000 < $150,000','$150,000 < $200,000','$200,000 or more','Dont know/Not sure', 'Refused', np.nan]
income_keys = sorted(list(BRFSS_df['INCOME3'].unique()))
income_replace_dict = dict(zip(income_keys, list_values))
#use dict to replace income
BRFSS_df['Income Group'] = BRFSS_df['INCOME3'].replace(income_replace_dict)
#order the income
income_category_order = ['Less than $10,000','$10,000 < $15,000','$15,000 < $20,000', '$20,000 < $25,000', 
                         '$25,000 < $35,000', '$35,000 < $50,000','$50,000 < $75,000','$75,000 < $100,000' ,
                         '$100,000 < $150,000','$150,000 < $200,000','$200,000 or more','Dont know/Not sure', 'Refused']
BRFSS_df['Income Group'] = pd.Categorical(BRFSS_df['Income Group'], categories=income_category_order, ordered=True)

#Income group is the only demographic group out of the four of interest that his missing data, drop Na values based on this variable
BRFSS_df = BRFSS_df.dropna(subset='Income Group')

columns_to_keep = ['_STATE'] + list(BRFSS_df.iloc[:,31:].columns)
df22_reduced = BRFSS_df[columns_to_keep]

#total weight used in BRFSS - needed for weighted national percentages/averages
total_weight = df22_reduced['_LLCPWT'].sum()

#ever had chronic health condition
list_comorbid_columns = ['CVDINFR4', 'CVDCRHD4', 'CVDSTRK3', 'ASTHMA3', 'ASTHNOW', 'CHCSCNC1', 'CHCOCNC1', 'CHCCOPD3', 'ADDEPEV3', 'CHCKDNY2','HAVARTH4', 'DIABETE4']
#rename the columns using these new names - from BRFSS codebook
new_comorbid_columns = ['Heart Attack', 'Heart Disease', 'Stroke', 'Asthma', 
                        'Current Asthma', 'Cancer (not melanoma)', 'Melanoma or other', 
                        'COPD or chronic Bronchitis', 'Depressive Disorder', 'Kidney Disease', 'Arthritis', 'Diabetes']
#rename dictionary
chronic_rename_dict = dict(zip(list_comorbid_columns, new_comorbid_columns))

#rename columns
df22_reduced = df22_reduced.rename(columns=chronic_rename_dict)

Multiple years of NHANES demographic and NHANES BMI data sets are combined into a single data set

#Initialize empty dataframes for 2008-2020 BMI NHANES, 2008-2020 demographic NHANES data
BMI_df = pd.DataFrame()
DEMO_timedf = pd.DataFrame()

#Iterate through dictionary of df names and df objects
for key, df in dataframe_dict.items():
    #If a "BMI" file, add year column based on title, concat to working BMI df
    if key.startswith('BMI') == True:
        year = str(20) + key[-2:]
        df['Year'] = year
        BMI_df = pd.concat([BMI_df, df])
    #If a "demo" file, add year column based on title, concat to working demo df
    if key.startswith('demo') == True:
        year = str(20) + key[-2:]
        df['Year'] = year
        DEMO_timedf = pd.concat([DEMO_timedf, df])

#Inner merge of demographic and BMI on SEQN and year, one large NHANES 2008-2020 dataframe
merged_timedf = DEMO_timedf.merge(BMI_df, on=['SEQN', 'Year'], how='inner')pyt

2017–2020 NHANES Data sets are combined for future analysis

#Specify 2017-2020 NHANES columns to keep
NHANES_cols = ['SEQN', 'RIAGENDR', 'RIDAGEYR', 'RIDRETH3', 'DMDEDUC2', 'INDFMPIR',
              'FSDHH', 'HIQ011', 'HUQ010', 'HUQ030','BPQ020','DIQ010', 'DID040', 'BPD035']

#Initialize empty list of NHANES df's, each to be added to
NHANES_dfs = []

for df_name, df in dataframe_dict.items():
    # Check if the last 5 characters of the key are "17_20"
    if df_name[-5:] == "17_20":
        # Append the matched NHANES df to the list of df's to merge later
        NHANES_dfs.append(df)

#Main df = combined_df, first df object in our list of NHANES
combined_df = NHANES_dfs[0]
#Iterate through remaining list of NAHNES df's
for df in NHANES_dfs[1:]:
    #merge dataframes on ID
    combined_df = pd.merge(combined_df, df, on='SEQN', how='inner')

#Keep only cols in NHANES_cols
combined_df = combined_df[NHANES_cols]

Recoding of NHANES data is performed for future analysis

#Recode variables
combined_df['RIDAGEYR'] = combined_df['RIDAGEYR'].astype(int)

#Using standardized age groups defined in BRFSS
age_bins = [18, 25, 35, 45, 55, 65, float('inf')]
age_labels = ['18-24', '25-34', '35-44', '45-54', '55-64', '65+']
combined_df['AGE_GROUP'] = pd.cut(combined_df['RIDAGEYR'], bins=age_bins, labels=age_labels, right=False)

#Other demographic variables
combined_df['RIAGENDR'] = combined_df['RIAGENDR'].replace({1: 'Male', 2: 'Female'})
race_eth_recode = {1: 'Hispanic', 2: 'Hispanic', 3: 'White', 4: 'Black', 6: 'Asian', 7: 'Other/Multi'}
combined_df['RIDRETH3'] = combined_df['RIDRETH3'].replace(race_eth_recode)
combined_df['DMDEDUC2'] = combined_df['DMDEDUC2'].apply(lambda x: 'College educated' if x in [4, 5] else 'No college education')
combined_df['INDFMPIR'] = combined_df['INDFMPIR'].apply(lambda x: 'At/above poverty line' if x > 1 else 'Below poverty line')

#Food security
food_security_recode = {1: 'Full food security', 2: 'Marginal food security', 3: 'Low food security', 4: 'Very low food security'}
combined_df['FSDHH'] = combined_df['FSDHH'].replace(food_security_recode)
food_security_order = ['Full food security','Marginal food security','Low food security','Very low food security']
combined_df['FSDHH'] = pd.Categorical(combined_df['FSDHH'], categories=food_security_order, ordered=True)

#Health insurance
hx_insurance_recode = {1: 'Has health insurance', 2: 'No health insurance', 7: np.nan, 9: np.nan}
combined_df['HIQ011'] = combined_df['HIQ011'].replace(hx_insurance_recode)

#Health condition, hospital utilization
hx_condition_recode = {1: 'Excellent', 2: 'Very good', 3: 'Good', 4: 'Fair', 5:'Poor', 7: np.nan, 9: np.nan}
combined_df['HUQ010'] = combined_df['HUQ010'].replace(hx_condition_recode)
hx_condition_order = ['Poor', 'Fair', 'Good', 'Very good', 'Excellent']
combined_df['HUQ010'] = pd.Categorical(combined_df['HUQ010'], categories=hx_condition_order, ordered=True)
combined_df['HUQ030'] = combined_df['HUQ030'].apply(lambda x: 'Routine care' if x in [1, 3] else 'No routine care')

#Diabetes/high blood pressure - will be used for dummy variables later
bp_recode = {1:'Yes', 2:'No', 7: np.nan, 9: np.nan}
combined_df['BPQ020'] = combined_df['BPQ020'].replace(bp_recode)
diabetes_recode = {1: 'Yes', 2:'No', 3:'Borderline', 7:np.nan, 9:np.nan}
combined_df['DIQ010'] = combined_df['DIQ010'].replace(diabetes_recode)

var_name_dict = {'AGE_GROUP': "Age Group", 'RIAGENDR': "Gender", 'RIDRETH3': "Race/Ethnicity", 'FSDHH': "Food Security",
                 'DMDEDUC2': "Education Level",'HIQ011': "Health Insurance Status",'HUQ030': "Health Care Utilization",
                 'INDFMPIR': "Income Relative to Poverty Line"}

Visualizations are performed below

Create a function to group BRFSS into demographic groups

def total_disease_group_divide_by_group_weight(df, column):
    '''Dataframe will be BRFSS-style dataframe, Data will be weighted to group percentage
    Column will be the demographic column to group by
    Returns a dataframe''' 

    #list of chronic disease that overlap 
    new_comorbid_columns = ['Heart Attack', 'Heart Disease', 'Stroke', 'Asthma', 
                            'Current Asthma', 'Cancer (not melanoma)', 'Melanoma or other', 
                            'COPD or chronic Bronchitis', 'Depressive Disorder', 'Kidney Disease', 'Arthritis', 'Diabetes']
    #find only columns with a 1 -- essentially create boolean mask and compare each column value to 1, True of False mask produced
    #columns to check
    condition1 = df[new_comorbid_columns].eq(1).any(axis=1).astype(int)
    df['At Least One Chronic Disease'] = condition1
    #take the sum of the weight column if the condition is met in the row
    #the number is quite high because it involved arthritis and depression
    sum_result_all_chronic = df.loc[df['At Least One Chronic Disease'] ==1].groupby(column)['_LLCPWT'].sum()

    #the total weight per group
    sum_group_all = df.groupby(column)['_LLCPWT'].sum()

    #find the percent of individuals in this group that have these conditions
    total_percent_chronic_disease = (sum_result_all_chronic/sum_group_all) * 100

    #reduce to only CVD related disease, diabetes, chronic coughing, asthma (these may be related to cardiovascular health -- see smoking cigarettes), kidney disease (related to diabetes issues)
    new_chronic_disease_columns = ['Heart Disease', 'Stroke', 'Asthma', 'COPD or chronic Bronchitis']

    #find only columns with a 1 -- essentially create boolean mask and compare each column value to 1, True of False mask produced
    #columns to check
    condition2 = df[new_chronic_disease_columns].eq(1).any(axis=1).astype(int)
    df['At Least One CVD Expanded Disease'] = condition2

    #take the sum of the weight column if the condition is met in the row
    sum_result_cvd_expanded = df.loc[df['At Least One CVD Expanded Disease']==1].groupby(column)['_LLCPWT'].sum()

    #the total weight per group
    sum_group_expanded = df.groupby(column)['_LLCPWT'].sum()

    #find the percent of individuals in this group that have these conditions
    total_percent_cvd_disease_expanded = ( sum_result_cvd_expanded/sum_group_expanded) * 100

    #reduce to only CVD related disease,
    new_cvd_disease_columns = ['Heart Attack', 'Heart Disease']
    #find only columns with a 1 -- essentially create boolean mask and compare each column value to 1, True of False mask produced --> as int
    #columns to check
    condition3 = df[new_cvd_disease_columns].eq(1).any(axis=1).astype(int)
    df['At Least One CVD Disease'] = condition3
    #take the sum of the weight column if the condition is met in the row
    sum_result_cvd = df.loc[df['At Least One CVD Disease'] ==1].groupby(column)['_LLCPWT'].sum()
    
    #the total weight per group
    sum_group_cvd = df.groupby(column)['_LLCPWT'].sum()

    #find the percent of individuals in this group that have these conditions
    total_percent_cvd_disease = (sum_result_cvd/sum_group_cvd) * 100




    data = {'All Chronic Disease':total_percent_chronic_disease, 'CVD Expanded':total_percent_cvd_disease_expanded, 'CVD Heart Attack/Stroke': total_percent_cvd_disease}
    return pd.DataFrame(data).reset_index()

Create a function to turn grouped data frame into a bar graph

def produce_barplot_from_dataframe(df, column):
    '''Take in a dataframe generated from total_disease_group
    Produce a bar plot, reference the column in previous function'''
    melted = df.melt(id_vars=column)
    sns.set(style="whitegrid")
    sns.barplot(data=melted, x='variable', y='value', hue=melted[column])
    plt.xlabel('Disease')
    plt.ylabel('Percent')
    plt.xticks(rotation=45)
    plt.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    plt.title(f'National Chronic Disease Rate by {column}')
    plt.show()

Example Visualization

produce_barplot_from_dataframe(total_disease_group_divide_by_group_weight(df22_reduced, 'Income Group'), 'Income Group')

Iterate through demographic groups and produce bar graph for each demographic

for demographic in demo_cols:
  produce_barplot_from_dataframe(total_disease_group_divide_by_group_weight(df22_reduced, demographic), demographic)
  plt.show()

Create a function to produce bar graphs for binary variable and demographic groups

def get_mean_rate_age_binary_variable_graph(df,col):
    '''This will work with any low response binary where binary values 1 = yes, 2 = no'''
    #get mean value
    #replace missing/refused values
    replace_dict = {7: np.nan, 9: np.nan}
    df[col] = df[col].replace(replace_dict)

    df22_ageg = df.groupby(['Age Group'])[[col]].mean()

    #take mean to get percentage rate, but adjust values so that reassign variables relate to 0 = yes, 1 = no
    #subtract these values from 1 and multiply value to get percentage WITH heart disease
    df_age_percentage = (1-(df22_ageg-1))*100

    sns.barplot(data=df_age_percentage, x=df_age_percentage.index, y=col)
    plt.ylabel(f'{col} Percentage')
    plt.xlabel('Age')
    plt.title(f'{col} Rates by Age')
    plt.show()

get_mean_rate_age_binary_variable_graph(df22_reduced,'Heart Disease')

Create a function to generate a map for different chronic diseases

def create_map_disease_type(df, col):
    '''Pass in BRFSS-like dataframe, do not include categorical columns. Use renamed columns - Heart Attack, Stroke etc.
    returns a map based on the disease selected'''    
    df_maps = df.copy()
    df_maps['_STATE'] = df_maps['_STATE'].astype(int)
    state_codes = dataframe_dict['us-state-ansi-fips']


    #ever had chronic health condition
    list_comorbid_columns = ['CVDINFR4', 'CVDCRHD4', 'CVDSTRK3', 'ASTHMA3', 'ASTHNOW', 'CHCSCNC1', 'CHCOCNC1', 'CHCCOPD3', 'ADDEPEV3', 'CHCKDNY2','HAVARTH4', 'DIABETE4']
    #rename the columns using this dictionary
    new_comorbid_columns = ['Heart Attack', 'Heart Disease', 'Stroke', 'Asthma', 'Current Asthma', 'Cancer (not melanoma)', 'Melanoma or other', 'COPD or chronic Bronchitis', 'Depressive Disorder', 'Kidney Disease', 'Arthritis', 'Diabetes']
    #rename dictionary
    chronic_rename_dict = dict(zip(list_comorbid_columns, new_comorbid_columns))

    df_maps = df_maps.rename(columns=chronic_rename_dict)



    #adjust state code columns

    state_codes = state_codes.rename(columns={' st':'_STATE', ' stusps':'state_code'})
    #remove the white space in the abbreviations for the state code
    state_codes['state_code'] = state_codes['state_code'].str.strip()

    #replace missing/refused values
    replace_dict = {7: np.nan, 9: np.nan}
    df_maps[col] = df_maps[col].replace(replace_dict)

    #get total state weights for future analysis - divide weighted disease with the total weight for each state
    total_state_weight = df_maps.groupby('_STATE')[['_LLCPWT']].sum()
    total_state_weight.reset_index(inplace=True)

    #find weighted percentage for the individuals with heart disease

    #isolate those with disease
    df22_cvd = df_maps[df_maps[col] == 1]



    ##this is where the edits start
    #calculate their weighted value
    df22_cvd = df22_cvd.assign(Weighted_D =  df22_cvd[col] * df22_cvd['_LLCPWT'])
    
    # #groupby the state
    df22_grouped_sum = df22_cvd.groupby('_STATE')[['Weighted_D']].sum()

    #merge back total state weight
    df22_grouped_sum = df22_grouped_sum.merge(total_state_weight, on='_STATE', how='inner')

    #divide each state's weighted CVD value by the state's total weight
    df22_grouped_sum = df22_grouped_sum.assign(Percent_D = df22_grouped_sum['Weighted_D']/df22_grouped_sum['_LLCPWT'])

    #create percentage col
    df22_grouped_sum['Percent State'] = df22_grouped_sum['Percent_D'] * 100

    #merge with the state code dataframe
    df22_grouped_merged = df22_grouped_sum.merge(state_codes, on='_STATE', how='inner')

    #plot the map of weighted CVD per state
    fig = px.choropleth(df22_grouped_merged,
                        locations='state_code',
                        locationmode="USA-states",
                        color='Percent State',
                        scope="usa",
                        color_continuous_scale="Viridis_r")
    fig.update_layout(
        width=600,  # Specify the width in pixels
        height=400  # Specify the height in pixels
    )

    fig.update_layout(
        title=f'Weighted {col} Per State'
    )
    fig.show()

create_map_disease_type(df22, 'Heart Disease')

create_map_disease_type(df22, 'Diabetes')

Run Spearman Correlation to evaluate monotonic correlation between chronic diseases

#do not us current asthma because missing greater than 300,000 data points --> column only exists for those individuals that had asthma
correlation_analysis_columns = ['Heart Attack','Heart Disease','Stroke','Asthma','Cancer (not melanoma)','Melanoma or other','COPD or chronic Bronchitis','Depressive Disorder','Kidney Disease','Arthritis','Diabetes']
#the NA numbers are quite low, for example 2 values
correlation_df_raw = df22_reduced[correlation_analysis_columns].dropna()
#all of these variables operate on a scale of 1=Yes, 2=No, 7=Dont Know, 9=Refused
#drop 7 and 9 so that spearman correlation produces cleaner results
replace_dict = {7: np.nan, 9: np.nan}
for column in correlation_analysis_columns:
    correlation_df_raw[column] = correlation_df_raw[column].replace(replace_dict)
#there are a few thousand null values for each column --> drop these values
correlation_df_raw.dropna(inplace=True)
#final df has no missing values and 412807 instance
# use correlation_df_raw in heatmap
#check the monotonic relationship between BRFSS chronic disease predictor variables
correlation_df = correlation_df_raw.corr(method='spearman')
sns.heatmap(correlation_df, cmap='coolwarm')
plt.title('Heatmap of Spearman Correlation')
plt.show()

Switch to NHANES — evaluate BMI over time

Weight data appropriately

#Set appropriate type, declare list of year values which can be indexed
merged_timedf['Year'] = merged_timedf['Year'].astype(int)
allowed_values = [2008,2010,2012,2014,2016,2018]

#If year is in list of allowed_values, apply weighting using old variable, else apply weighting using new variable
merged_timedf['Weighted BMI'] = merged_timedf.apply(lambda row: row['BMXBMI']*row['WTMEC2YR'] if row['Year'] in allowed_values else row['BMXBMI']*row['WTMECPRP'],axis=1)
merged_timedf['Weighted Body Weight'] = merged_timedf.apply(lambda row: row['BMXWT']*row['WTMEC2YR'] if row['Year'] in allowed_values else row['BMXWT']*row['WTMECPRP'],axis=1)

Create a new data frame with weighted average BMI per period

#Create shells for BMI annual weighted average --> become df
list_bmi = []
list_body_weight = []
bmi_dict = {}
year_list = list(merged_timedf['Year'].unique())

#Iterate through all years...
#Distinguish weighting procedures between earlier years and 2020, as in previous cell
for year in year_list:
    if year in allowed_values:
        df = merged_timedf[merged_timedf['Year'] == year]
        bmi_sum = df['Weighted BMI'].sum()
        sum_weights= df['WTMEC2YR'].sum()
        list_body_weight.append(df['Weighted Body Weight'].sum()/sum_weights)     
        yearly_weighted_bmi = bmi_sum/sum_weights
        list_bmi.append(yearly_weighted_bmi)
        bmi_dict[year] = yearly_weighted_bmi
    else:
        df = merged_timedf[merged_timedf['Year'] == 2020]
        bmi_sum = df['Weighted BMI'].sum()
        sum_weights= df['WTMECPRP'].sum()
        list_body_weight.append(df['Weighted Body Weight'].sum()/sum_weights)     
        yearly_weighted_bmi = bmi_sum/sum_weights
        list_bmi.append(yearly_weighted_bmi)
        bmi_dict[year] = yearly_weighted_bmi

#Generate new dataframe based on previously defined lists and dicts
bmi_vizdf = pd.DataFrame({"Year":year_list, "BMI":list_bmi, 'Weight':list_body_weight})

Visualize BMI over time period

#BMI of 25 is considered the low range for overweight
#BMI of >30 is considered obese
plt.figure(figsize=(8, 4))
sns.lineplot(data=bmi_vizdf, x='Year', y='BMI', label='Weighted BMI Average')
plt.axhline(y=25,color='green', label ='Overweight',linestyle =":")
plt.axhline(y=30, color='red', label ='Obese')
plt.title('Average Weighted BMI Over Time')
plt.xlim([2008,2020])
plt.legend(loc ='best')
plt.show()

Evaluate the age of onset for Blood Pressure and Diabetes in NHANES data set for the period 2017–2020. Use boxplots to understand the distribution of the data.

#Select new dataframe from combined_df NHANES - sequence no., age onset blood pressure, age onset diabetes
#Drop missing values
dia_bp = combined_df[['SEQN','BPD035','DID040']].dropna()
#Remove outliers
dia_bp = dia_bp[(dia_bp['DID040'] >10) & (dia_bp['DID040'] < 600)]
dia_bp = dia_bp[dia_bp['BPD035'] < 300]
#Rename columns
dia_bp = dia_bp.rename(columns={'DID040':'Diabetes', 'BPD035':'High Blood Pressure'})
dia_bp_melt = pd.melt(dia_bp, id_vars=['SEQN'])

#Plot
sns.boxplot(data=dia_bp_melt, x='value', y='variable')
plt.xlabel('Age')
plt.ylabel('Condition')
plt.title('Age of Onset')
plt.show()

Use SPLOM to compare scatterplots of these variables and BMI for individuals with data present across all three variables.

bmidemo2020 = merged_timedf[merged_timedf['Year'] == 2020]

#merge the diabetes, blood pressure, and bmidemo data into one data set
#keep only the individuals that have data in all sets
DBPBMI = dia_bp.merge(bmidemo2020, on='SEQN',how='inner')
#rename the BMXBMI column for a better visualization
DBPBMI = DBPBMI.rename(columns={'BMXBMI':'BMI'})
#run a pair plot looking at age of onset of diabetes, high blood pressure, and BMI for individuals that have data for all sections
plot = sns.pairplot(DBPBMI[['Diabetes', 'High Blood Pressure', 'BMI']])
plot.fig.suptitle("Health Metric SPLOM", y=1)
plt.show()

Merge BRFSS and NHANES on age categories and track chronic disease with average BMI.

# Define the desired order of the category column
age_bins = [18, 25, 35, 45, 55, 65, float('inf')]
age_labels = ['18 to 24', '25 to 34', '35 to 44', '45 to 54', '55 to 64', '65 or older']
merged_timedf['Age Group'] = pd.cut(merged_timedf['RIDAGEYR'], bins=age_bins, labels=age_labels, right=False)

bmitotalweight2020 = bmidemo2020['WTMECPRP'].sum()
age_bmi = (merged_timedf.groupby(['Age Group'])[['Weighted BMI']].sum())/bmitotalweight2020
age_bmi = age_bmi.reset_index()

brfss_age_total = total_disease_group_divide_by_group_weight(df22_reduced, 'Age Group')

age_bmi_cvd_total = brfss_age_total.merge(age_bmi, how='inner', on='Age Group')
age_bmi_cvd_total_melted = age_bmi_cvd_total.melt(id_vars=['Age Group'])

# Create the initial bar plot (left y-axis)
plt.figure(figsize=(8, 6))
ax1 = sns.barplot(data = age_bmi_cvd_total_melted.iloc[:18,], x='Age Group', y='value', hue='variable')
ax1.set_xlabel('Categories')
ax1.set_ylabel('Percentage')
ax1.set_yticks([0,10,20,50,75])
ax1.legend(loc='center left')
# Create a second y-axis on the right side
ax2 = ax1.twinx()

# Plot the non-percentage values as bars on the right y-axis
sns.lineplot(data = age_bmi_cvd_total_melted.iloc[18:,], x='Age Group', y='value', label='Weighted BMI', ax=ax2)
ax2.set_ylabel('Weighted BMI')
ax2.set_yticks([0,25])
ax2.axhline(y=25,color='green', label ='Overweight',linestyle =":")
plt.title('Chronic Disease and Weighted BMI by Age')
plt.legend(loc='upper left')
# Show the plot
plt.show()

Exploratory Data Analysis: Cardiovascular Disease and Related Health Variables — Python Code

Written by Nicholas Salem