Food Subgroups
This notebook reuses the code from the Exploration Phase. The code and comments will be updated as the work proceeds, and they may not be perfect until finalized.
# PCA section, Correlation, Heatmaps, can be seen to be the most important parts
# Might have to adjust/improve my conclusions from the plots in future work
# Target variable: ACR, i.e. Albumin Creatinine Ratio
# Other Probable targets: Kidney failed, Systolic/Diastolic Pressure
# Tried the following Target variables previously that might not even be present now. Though might bring them back if seem to be important
# ESRD patients: Avg. Annual Mortality rates is given the most importance
# ESRD patients: Total (or %) deaths for target year
# Dialysis patients: Total (or %) deaths for target year
# For plotting, import libraries
import os
import warnings
from subprocess import check_output

import matplotlib
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display, HTML
from matplotlib import pyplot as plt

%matplotlib inline
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)import os
#data_folder = './acr-data-for-analysis/food-subgroups/'
data_folder = './data-for-code/'
data_files = os.listdir(data_folder)
'transformed_regression_variables_after_pca_saved_for_factor_transformations.csv']

data_file = data_folder + 'average_intake_by_food_subgroups_demo_acr_kidney_blood_pressure.xlsx'
excel_file = pd.ExcelFile(data_file)# see all sheet names
sheet_names = excel_file.sheet_names
sheet_names[:4]

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)
foodgroup_acr_data_df = excel_file.parse('Sheet1')
PCA: Apply PCA to find important features
#foodgroup_acr_data_df['food_group_id']df_food_group_only = foodgroup_acr_data_df[ ['participant_id', 'food_group_name', 'avg_food_weight_in_gms'] ]
df_food_group_only_index = df_food_group_only.set_index(['participant_id', 'food_group_name'])
df_food_group_only_index_unstack = df_food_group_only_index.unstack()
df_without_food_groups = foodgroup_acr_data_df.drop(['food_group_name', 'food_group_id', 'avg_food_weight_in_gms'], axis=1)
df_without_food_groups_one_entry = df_without_food_groups.groupby(['participant_id']).mean()
merged = df_food_group_only_index_unstack.merge(df_without_food_groups_one_entry, how='inner', left_on='participant_id', right_on='participant_id', left_index=True)
# remove extra spaces in the column names
column_dict = {}
for aName in merged.columns:
#if ( aName.index( 'avg_food_weight_in_gms' ) >= 0):
if 'avg_food_weight_in_gms' in aName:
#aName =
column_dict[aName] = aName[1] #aName.strip()
column_dict[aName] = aName #.strip()
# column_dictmerged = merged.rename(index=str, columns=column_dict)
merged_f = merged.fillna(0)
merged_f.to_csv(data_folder + 'food-subgroups-subgroup-names-on-columns.csv')regr_data = pd.read_csv(data_folder + 'regression_variables_after_pca_saved_for_factor_transformations.csv')
regr_data_t = regr_data.T
regr_data_t.to_csv(data_folder + 'transformed_regression_variables_after_pca_saved_for_factor_transformations.csv')merged.columnsIndex(['Added Sugars/Sugars and sweets', 'Alcoholic beverages',
'Citrus fruits, juices', 'Dark-green vegetables', 'Dried fruits',
'Fruit juices and nectars excluding citrus',
'Fruits and juices baby food', 'Meat, Poultry and Eggs subgroup',
'Milk desserts, sauces, gravies', 'Milks and milk drinks ',
'Nonalcoholic beverages', 'Nuts, Seeds, and Soy Products', 'Oils',
'Other fruits', 'Other vegetables', 'Protein',
'Red and orange vegetables', 'Refined grains', 'Seafood', 'Solid Fats',
'Starchy vegetables', 'Water, noncarbonated', 'Whole grains',
'participant_age', 'gender', 'acr', 'kidney_failed',
'systolic_pressure', 'diastolic_pressure', 'calorie', 'protein',
'sodium', 'carbohydrate', 'sugar', 'fibre', 'fat', 'saturated_fat',
'mono_fat', 'poly_fat', 'cholesterol', 'calcium', 'phosphorous',
'magnesium', 'potassium', 'alcohol', 'a_sample_food_code',
'albumin_urine_mu_g', 'albumin_urine_mg', 'creatinine_mu_mol',
'creatinine_mg', 'received_dialysis_in_12_months', 'kidney_stones',
'passed_kidney_stones_12_months', 'urinary_leakage_frequency',
'urine_lose_each_time', 'leak_during_activities',
'how_frequent_leak_occurs', 'urinated_before_reaching_toilet',
'how_frequent', 'leak_during_nonphysical_activities',
'how_frequest_leak_nonphysical', 'how_much_leak_bothering',
'how_much_daily_activities_affected', 'count_night_time_urinate'],
[ 'Added Sugars/Sugars and sweets',
'Alcoholic beverages',
'Citrus fruits, juices',
'Dark-green vegetables',
'Dried fruits',
'Fruit juices and nectars excluding citrus',
'Fruits and juices baby food',
'Meat, Poultry and Eggs subgroup',
'Milk desserts, sauces, gravies',
'Milks and milk drinks ',
'Nonalcoholic beverages',
'Nuts, Seeds, and Soy Products', 'Oils',
'Other fruits',
'Other vegetables',
'Red and orange vegetables',
'Refined grains',
'Solid Fats',
'Starchy vegetables',
'Water, noncarbonated',
'Whole grains',
#'albumin_urine_mu_g', 'albumin_urine_mg', 'creatinine_mu_mol',
#'creatinine_mg', 'received_dialysis_in_12_months', 'kidney_stones',
#'passed_kidney_stones_12_months', 'urinary_leakage_frequency',
#'urine_lose_each_time', 'leak_during_activities',
#'how_frequent_leak_occurs', 'urinated_before_reaching_toilet',
#'how_frequent', 'leak_during_nonphysical_activities',
#'how_frequest_leak_nonphysical', 'how_much_leak_bothering',
#'how_much_daily_activities_affected', 'count_night_time_urinate'
]# removing text features and/or not that important features
#df_drop_pca = merged #.drop([ 'food_group_from_recom_amount', 'm_food_group_name', 'a_sample_food', 'a_sample_food_name'], axis=1)
#df_drop_pca = df_drop_pca.drop(['acr'], axis=1)
# 'participant_age', 'gender',
df_drop_pca.fillna(0, inplace=True)
# steps followed from ref:
import sklearn
from sklearn import preprocessing
standardisedX = sklearn.preprocessing.scale(df_drop_pca)
standardisedX = pd.DataFrame(standardisedX, index=df_drop_pca.index, columns=df_drop_pca.columns)
standardisedX.apply(np.mean)Added Sugars/Sugars and sweets 2.091783e-16
Alcoholic beverages 3.822242e-16
Citrus fruits, juices 1.694832e-17
Dark-green vegetables 9.322008e-16
Dried fruits 1.785011e-15
Fruit juices and nectars excluding citrus -2.310308e-16
Fruits and juices baby food 7.944569e-16
Meat, Poultry and Eggs subgroup -8.868298e-17
Milk desserts, sauces, gravies -3.878713e-16
Milks and milk drinks -1.797387e-16
Nonalcoholic beverages 6.543493e-17
Nuts, Seeds, and Soy Products 1.522320e-16
Oils 9.512190e-16
Other fruits -2.575567e-16
Other vegetables -3.786687e-17
Protein 1.673196e-16
Red and orange vegetables -3.572561e-16
Refined grains -1.664181e-16
Seafood 4.954679e-16
Solid Fats -4.734747e-16
Starchy vegetables -6.223422e-16
Water, noncarbonated -4.369421e-17
Whole grains 9.678572e-17
dtype: float64standardisedX.apply(np.std)
from sklearn import decomposition
#pca = decomposition.PCA(n_components=2).fit(standardisedX)
pca = decomposition.PCA().fit(standardisedX)
pca
svd_solver='auto', tol=0.0, whiten=False)#ref:
def pca_summary(pca, standardised_data, out=True):
    """Tabulate the importance of each principal component.

    Parameters
    ----------
    pca
        Fitted decomposition object; only its ``explained_variance_ratio_``
        attribute and ``transform`` method are used.
    standardised_data
        Data handed to ``pca.transform`` to obtain the component scores.
    out : bool, default True
        When true, print the "Importance of components:" heading followed by
        the summary table.

    Returns
    -------
    pandas.DataFrame
        One row per component ("PC1", "PC2", ...) with two-level columns
        ("sdev", "Standard deviation"), ("varprop", "Proportion of Variance")
        and ("cumprop", "Cumulative Proportion").
    """
    ratios = np.asarray(pca.explained_variance_ratio_, dtype=float)
    names = ["PC" + str(i) for i in range(1, len(ratios) + 1)]
    # np.std defaults to ddof=0, so this is the population standard deviation
    # of the scores along each component.
    sdev = np.std(pca.transform(standardised_data), axis=0)
    # Running total in one pass instead of re-summing a growing slice.
    cumulative = np.cumsum(ratios)
    columns = pd.MultiIndex.from_tuples([
        ("sdev", "Standard deviation"),
        ("varprop", "Proportion of Variance"),
        ("cumprop", "Cumulative Proportion"),
    ])
    summary = pd.DataFrame(list(zip(sdev, ratios, cumulative)), index=names, columns=columns)
    if out:
        print("Importance of components:")
        # Print the table itself so the heading is not left dangling.
        print(summary)
    return summary


summary = pca_summary(pca, standardisedX)
# Output: Importance of components:
The first 22 components explain 83% of the variance; the first 26 components explain over 90%.
# ref:
def screeplot(pca, standardised_values, savepath='pca_components_variance.png'):
    """Draw a scree plot of the variance captured by each principal component.

    Parameters
    ----------
    pca
        Fitted decomposition object; only ``transform`` is used.
    standardised_values
        Data handed to ``pca.transform`` to obtain the component scores.
    savepath : str or None, default 'pca_components_variance.png'
        File the current figure is written to. Pass ``None`` (or an empty
        string) to skip saving.

    Notes
    -----
    NOTE(review): indentation was lost in this export; the save call is
    assumed to belong to the function body because it precedes the first
    ``screeplot(...)`` call — confirm against the original notebook.
    """
    # Variance of the scores along each component (square of the ddof=0 std).
    y = np.std(pca.transform(standardised_values), axis=0) ** 2
    x = np.arange(len(y)) + 1
    plt.plot(x, y, "o-")
    plt.xticks(x, ["Comp." + str(i) for i in x], rotation=60)
    plt.title('PCA variance in Components')
    if savepath:
        plt.savefig(savepath)


plt.rcParams['figure.figsize'] = 10, 10
screeplot(pca, standardisedX)
plt.savefig('./images/pca_components_variance_food_subgroup' + '.png')
<Figure size 720x720 with 0 Axes>
The largest change in slope is from component 3 to component 4, so the first three components can be retained.
Other large slope changes occur at 5 to 6, 8 to 9, 25 to 26, and 30 to 31.
Although the first 3 components may be the significant ones, up to 26 can be experimented with, or even just up to 8.
Retaining just the first 8 is preferred.
# ref:
# not my code, I am using this as (similar to) a library function
def calcpc(variables, loadings):
    """Return the scores of one principal component for every sample.

    Each score is the sum over variables of ``value * loading``, i.e. the dot
    product of a sample's row with the loading vector.

    Parameters
    ----------
    variables
        Two-dimensional table of samples (rows) by variables (columns); a
        DataFrame or anything ``np.asarray`` can turn into a 2-D float array.
    loadings
        Loading of each variable on the component; only the first
        ``numvariables`` entries are used.

    Returns
    -------
    numpy.ndarray
        One float score per sample.
    """
    values = np.asarray(variables, dtype=float)
    # Unpacking rejects anything that is not two-dimensional.
    numsamples, numvariables = values.shape
    weights = np.asarray(loadings, dtype=float)[:numvariables]
    # One matrix-vector product replaces the per-cell Python double loop.
    return np.dot(values, weights)


calcpc(standardisedX, pca.components_[0])
# Output: array([ 0.4024334 , 2.86288165, 0.63548882, ..., -0.27547279,
#                -0.40052422, 0.01408633])

pca.transform(standardisedX)[:, 0]
# Output: array([ 0.4024334 , 2.86288165, 0.63548882, ..., -0.27547279,
#                -0.40052422, 0.01408633])

pca.components_[1]
# Output: array([-0.04013536, -0.21662898, 0.2646046 , 0.2096937 , 0.15558643,
#                0.23267299, -0.01198426, -0.10661167, 0.24079041, 0.30859506,
#                -0.15275508, 0.22375509, 0.14752615, 0.45938378, 0.30101746,
#                0.23534633, 0.25186201, 0.19465321, 0.16763907, 0.08938424,
#                0.06329616, 0.01186119, -0.02546397])

np.sum(pca.components_[1]**2)
# Output: 1.0000000000000004

# for following code : Classes will/might be defined such as like High, low, neutral mortality in final work
y = df_drop_pca['acr'] #< 0.5
dbk= np.where(df_drop_pca['acr'] < 30, 'normal-mild', df_drop_pca['acr'])
dbk = np.where(df_drop_pca['acr'] > 30, 'moderate', df_drop_pca['acr'])
y = []
for aVal in merged['acr']:
if aVal < 30:
if aVal > 300:
elif aVal < 150:
elif aVal > 150:
...]# # ref:
# not my code from the URL above, using this as a library function
def pca_scatter(pca, standardised_values, classifs):
    """Scatter the first two principal-component scores, coloured by class.

    Parameters
    ----------
    pca
        Fitted decomposition object; only ``transform`` is used.
    standardised_values
        Data handed to ``pca.transform``; the result must have at least two
        columns.
    classifs
        Iterable of class labels, paired with the samples in order.
    """
    scores = pca.transform(standardised_values)
    frame = pd.DataFrame(
        list(zip(scores[:, 0], scores[:, 1], classifs)),
        columns=["PC1", "PC2", "Class"],
    )
    # x / y / data are passed by keyword: current seaborn releases no longer
    # accept them positionally, while keywords work on old releases too.
    sns.lmplot(x="PC1", y="PC2", data=frame, hue="Class", fit_reg=False)
#y = df_esrdonly[' ESRD patients: Total (or %) deaths for target year']
#y = np.std(pca.transform(standardisedX), axis=0)**2
plt.rcParams['figure.figsize'] = 12, 12
pca_scatter(pca, standardisedX, y)
#plt.title('Only two components can separate the mortality data. True = High ')
#plt.savefig('pca_components_separating_high_low_mortality' + '.png')# reference:
plt.rcParams['figure.figsize'] = 30, 16
components_to_count = 3
pca_components_cont = pca.components_[0:components_to_count]
ylabels = []
for c in range (components_to_count):
ylabels.append('Component ' + str(c) )
xlabels = []
for c in range (components_to_count):
plt.matshow(pca_components_cont, cmap='viridis')
plt.yticks(xlabels, ylabels, fontsize=10)
plt.xticks(range(len(df_drop_pca.columns)), df_drop_pca.columns, rotation=90, ha='left')
# Negative: Refined grain, Milk drink, solid fat# reference:
plt.rcParams['figure.figsize'] = 30, 16
components_to_count = 8
pca_components_cont = pca.components_[0:components_to_count]
ylabels = []
for c in range (components_to_count):
ylabels.append('Component ' + str(c) )
xlabels = []
for c in range (components_to_count):
plt.matshow(pca_components_cont, cmap='viridis')
plt.yticks(xlabels, ylabels, fontsize=10)
plt.xticks(range(len(df_drop_pca.columns)), df_drop_pca.columns, rotation=90, ha='left')
From the first 3 components: nonalcoholic beverages, other fruits, dark-green vegetables, alcoholic beverages, seafood, milk.
1st component: nonalcoholic beverages, meat, alcoholic beverages, starchy vegetables; water, whole grains, solid fats, protein.
2nd component: other fruits, milks.
3rd component: dark-green vegetables, seafood.
From the first 7 components: fruits and juices, nonalcoholic beverages, nuts, oils, other fruits, meat, alcoholic beverages, dark-green vegetables, protein, refined grains, whole grains, fats.
selection for regression
regression_variables = merged[
#'Added Sugars/Sugars and sweets', :
'Alcoholic beverages',
#'Citrus fruits, juices',
'Dark-green vegetables',
#'Dried fruits',
#'Fruit juices and nectars excluding citrus',
'Fruits and juices baby food',
'Meat, Poultry and Eggs subgroup',
'Milk desserts, sauces, gravies',
'Milks and milk drinks ',
'Nonalcoholic beverages',
'Nuts, Seeds, and Soy Products', 'Oils',
'Other fruits',
#'Other vegetables',
#'Red and orange vegetables',
'Refined grains',
'Solid Fats',
'Starchy vegetables',
'Water, noncarbonated',
'Whole grains',
plt.figure(figsize=(16, 16))
corr = regression_variables.corr()
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True,
plt.suptitle('Heatmap, Correlation All Variables');
Affecting ACR: milk dessert, meat and poultry, alcohol, high-blood-pressure related. Affecting kidney failed: dark-green vegetable, milk dessert.
The code below is not part of the methodology and experiment part.
# The above plot kind of shows the correlation with CKD and ACR
df_drop_pca.columnsIndex(['Added Sugars/Sugars and sweets', 'Alcoholic beverages',
'Citrus fruits, juices', 'Dark-green vegetables', 'Dried fruits',
'Fruit juices and nectars excluding citrus',
'Fruits and juices baby food', 'Meat, Poultry and Eggs subgroup',
'Milk desserts, sauces, gravies', 'Milks and milk drinks ',
'Nonalcoholic beverages', 'Nuts, Seeds, and Soy Products', 'Oils',
'Other fruits', 'Other vegetables', 'Protein',
'Red and orange vegetables', 'Refined grains', 'Seafood', 'Solid Fats',
'Starchy vegetables', 'Water, noncarbonated', 'Whole grains'],
dtype='object')df_drop_pca_important = df_drop_pca[
[ 'Dairy', 'Fats, oils, and salad dressings', 'Fruits', 'Grains',
'Protein', 'Sugars, sweets, and beverages', 'Vegetables',
'participant_age', 'gender',
'kidney_failed', 'systolic_pressure', 'diastolic_pressure',
'calorie', 'protein', 'sodium', 'carbohydrate', 'sugar', 'fibre', 'fat',
'saturated_fat', 'mono_fat', 'poly_fat', 'cholesterol', 'calcium',
'phosphorous', 'magnesium', 'potassium', 'alcohol',
'albumin_urine_mu_g', 'albumin_urine_mg',
'creatinine_mu_mol', 'creatinine_mg',
'kidney_stones', 'passed_kidney_stones_12_months',
'urinary_leakage_frequency', 'urine_lose_each_time',
'leak_during_activities', 'how_frequent_leak_occurs',
'urinated_before_reaching_toilet', 'how_frequent',
'leak_during_nonphysical_activities', 'how_frequest_leak_nonphysical',
'how_much_leak_bothering', 'how_much_daily_activities_affected'
corr = df_drop_pca_important.corr()
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation All Variables');
Will Check on Actual Taken Amount Only
age_from and age_to can be removed
df = pd.read_csv(data_folder + 'no-empty-data-only-actual-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing.csv')
# df = df.drop([ 'age_from', 'age_to' ], axis=1)
age_from age_to Actual Vegetable Intake Actual Protein Intake Actual Grain Intake Actual Dairy Intake Actual Fruit intakes Actual Taken Sugars sweets and beverages amount Avg Fats oils and salad dressings taken ESRD patients: Total (or %) deaths for target year ESRD patients: Avg. Annual Mortality rates Dialysis patients: Total (or %) deaths for target year Dialysis patients: Avg. Annual Mortality rates 0.907545 0.907790 0.606775 0.040950 -0.452625 -0.408543 -0.372665 0.178760 -0.081677 1.000000 0.828148 0.999315 0.864052
ESRD patients: Avg. Annual Mortality rates 0.854841 0.854759 0.444925 -0.151035 -0.572187 -0.238108 -0.369639 0.022898 -0.202818 0.828148 1.000000 0.847294 0.995676 Dialysis patients: Total (or %) deaths for target year 0.914524 0.914723 0.603179 0.031982 -0.462540 -0.404490 -0.375241 0.172767 -0.087055 0.999315 0.847294 1.000000 0.880927 Dialysis patients: Avg. Annual Mortality rates 0.878659 0.879439 0.480531 -0.134234 -0.580723 -0.264096 -0.370520 0.050212 -0.208805 0.864052 0.995676 0.880927 1.000000
plt.figure(figsize=(12, 8))
corr = df.corr()
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation Actual Intake Amounts Only');# Without Age Groupsdf_without_ages = df.drop(['age_from', 'age_to', ' Gender'], axis=1)
df_without_ages.head()df_without_ages.corr()plt.figure(figsize=(12, 8))
corr = df_without_ages.corr()
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation Actual Intake Amounts Only');
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/univariate_food_groups_heatmaps_actual_intake_amount' + '.png')
Will use Normalization all data
df = pd.read_csv('no-empty-data-only-actual-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing.csv')
df = df.drop([' Gender'], axis=1)
df.head()# normalize data
df_normalized = (df - df.mean())/ (df.max() - df.min())
# df.head()df_normalized.corr()
age_from age_to Actual Vegetable Intake Actual Protein Intake Actual Grain Intake Actual Dairy Intake Actual Fruit intakes Actual Taken Sugars sweets and beverages amount Avg Fats oils and salad dressings taken ESRD patients: Total (or %) deaths for target year ESRD patients: Avg. Annual Mortality rates Dialysis patients: Total (or %) deaths for target year Dialysis patients: Avg. Annual Mortality rates ESRD patients: Total (or %) deaths for target year 0.907545 0.907790 0.606775 0.040950 -0.452625 -0.408543 -0.372665 0.178760 -0.081677 1.000000 0.828148 0.999315 0.864052 ESRD patients: Avg. Annual Mortality rates 0.854841 0.854759 0.444925 -0.151035 -0.572187 -0.238108 -0.369639 0.022898 -0.202818 0.828148 1.000000 0.847294 0.995676 Dialysis patients: Total (or %) deaths for target year 0.914524 0.914723 0.603179 0.031982 -0.462540 -0.404490 -0.375241 0.172767 -0.087055 0.999315 0.847294 1.000000 0.880927 Dialysis patients: Avg. Annual Mortality rates 0.878659 0.879439 0.480531 -0.134234 -0.580723 -0.264096 -0.370520 0.050212 -0.208805 0.864052 0.995676 0.880927 1.000000
plt.figure(figsize=(12, 8))
corr = df_normalized.corr()
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation Actual Intake Amounts Only');
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/univariate_food_groups_heatmaps_normalized_actual_intake_amount' + '.png')
Will check only on the difference from average recommended amount
df = pd.read_csv('copy-only-diff-no-empty-data-diff-recomm-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing .csv')
df.head()df.describe()# find correlation with price after standardization
age_from age_to Actual Vegetable Intake Actual Protein Intake Actual Grain Intake Actual Dairy Intake Actual Fruit intakes Actual Taken Sugars sweets and beverages amount Avg Fats oils and salad dressings taken Diff Vegetable diff protein diff grain diff dairy diff fruit ESRD patients: Total (or %) deaths for target year ESRD patients: Avg. Annual Mortality rates Dialysis patients: Total (or %) deaths for target year Dialysis patients: Avg. Annual Mortality rates ESRD patients: Total (or %) deaths for target year
0.907545 0.907790 0.606775 0.040950 -0.452625 -0.408543 -0.372665 0.178760 -0.081677 0.426499 -0.273596 -0.268453 -0.364665 -0.488155 1.000000 0.828148 0.999315 0.864052
ESRD patients: Avg. Annual Mortality rates 0.854841 0.854759 0.444925 -0.151035 -0.572187 -0.238108 -0.369639 0.022898 -0.202818 0.359590 -0.266776 -0.263227 -0.213864 -0.392090 0.828148 1.000000 0.847294 0.995676
Dialysis patients: Total (or %) deaths for target year 0.914524 0.914723 0.603179 0.031982 -0.462540 -0.404490 -0.375241 0.172767 -0.087055 0.424340 -0.278817 -0.272938 -0.362140 -0.489873 0.999315 0.847294 1.000000 0.880927
Dialysis patients: Avg. Annual Mortality rates 0.878659 0.879439 0.480531 -0.134234 -0.580723 -0.264096 -0.370520 0.050212 -0.208805 0.401294 -0.244088 -0.246893 -0.229239 -0.387759 0.864052 0.995676 0.880927 1.000000
plt.figure(figsize=(12, 8))
corr = df.corr()
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation Actual Intake Variables, and Difference from Recommended Intake Variables');df.head()
df = df.drop([' Gender'], axis=1)
df.head()# normalize data
df_normalized = (df - df.mean())/ (df.max() - df.min())
#df.head()df_normalized.corr()plt.figure(figsize=(12, 8))
corr = df_normalized.corr()
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation Actual Intake Variables, and Difference from Recommended Intake Variables');
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/univariate_food_groups_heatmaps_diff_intake_amount_normalized' + '.png')
The most important bivariate plots are bivariate_diff_norm.png and bivariate_diff_norm_rate_only.png: they show the difference in intake amounts from the recommended amounts, normalized.
Bivariate plots of the actual intake amounts against the target variables will be saved in bivariate.png; the correlation pattern can be checked in the saved image.
Correlations among the intake amounts do not say much here, because they use actual intake amounts rather than differences from the recommended amounts, and because older people tend to have higher intake. Normalized data should give a truer picture of the relationship.
The correlations were shown above. Linearity and non-linearity can now be examined using the bivariate plots.
The plots are consistent with the correlations.
import numpy as np
import pandas as pd
from IPython.display import display, HTML
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
%matplotlib inline
# ref in ref section
On Actual intake Amounts
df = pd.read_csv('no-empty-data-only-actual-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing.csv')
#df_actual = df.drop([' Gender', 'age_from', 'age_to',' ESRD patients: Avg. Annual Mortality rates', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
#df_actual = df.drop([' Gender', 'age_from', 'age_to',' ESRD patients: Avg. Annual Mortality rates', ' ESRD patients: Total (or %) deaths for target year', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
df_actual = df.drop([' Gender', 'age_from', 'age_to'], axis=1)
df_actual.head()# on actual amounts
#plt.figure(figsize=(16, 300))
sns.pairplot(df_actual, vars=df_actual.columns, size=5, kind='reg'); # diag_kind='kde',
plt.title('Bivariate Plot, All Actual Taken Variables, Total ESRD target variable');
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/bivariate_food_group' + '.png')
On difference from recommended amount (data not normalized)
df = pd.read_csv('copy-only-diff-no-actual-no-empty-cell-diff-recomm-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing .csv')
df.head()# On difference from recommended amount (data not normalized)
#df_actual_diff = df.drop(['age_from', 'age_to',' ESRD patients: Avg. Annual Mortality rates', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
#df_actual = df.drop([' Gender', 'age_from', 'age_to',' ESRD patients: Avg. Annual Mortality rates', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
#df_actual = df.drop([' Gender', 'age_from', 'age_to',' ESRD patients: Avg. Annual Mortality rates', ' ESRD patients: Total (or %) deaths for target year', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
df_actual_diff = df.drop(['age_from', 'age_to'], axis=1)
#df.head()# On difference from recommended amount (data not normalized)
sns.pairplot(df_actual_diff, vars=df_actual_diff.columns, size=5, kind='reg');
#plt.title('Bivariate Plot, All Actual Taken Variables, Total ESRD target variable');
#plt.savefig('./saved_images_from_visualizations/' + 'line_' +indicator.replace(' ', '_')[0:10] + '_' + str(np.random.randint(0, 99999)) + '.png')
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/bivariate_diff' + '.png')
On difference from recommended amount (data/diff normalized)
df_normalized_diff = (df_actual_diff - df_actual_diff.mean())/ (df_actual_diff.max() - df_actual_diff.min())
sns.pairplot(df_normalized_diff, vars=df_normalized_diff.columns, size=5, kind='reg', aspect=1); # diag_kind='kde',
#plt.title('Bivariate Plot, All Actual Taken Variables, Total ESRD target variable');
#plt.savefig('./saved_images_from_visualizations/' + 'line_' +indicator.replace(' ', '_')[0:10] + '_' + str(np.random.randint(0, 99999)) + '.png')
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/bivariate_diff_norm' + '.png') = df_normalized_diff.drop([' ESRD patients: Total (or %) deaths for target year', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates'], axis=1)
df_normalized_diff.head()sns.pairplot(df_normalized_diff, vars=df_normalized_diff.columns, size=5, kind='reg', aspect=1); # diag_kind='kde',
plt.suptitle('Bivariate : Diff : Food Group: Normalized\n')
plt.ylabel('Difference in Intake amount from Recommended : Normalized')
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/bivariate_diff_norm_rate_only' + '.png') can be ignored
Applying PCA on Difference data
df_esrdonly = df.drop(['age_from', 'age_to',' ESRD patients: Total (or %) deaths for target year', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
#df_esrdonly.head()# steps followed from ref:
import sklearn
from sklearn import preprocessing
standardisedX = sklearn.preprocessing.scale(df_esrdonly)
standardisedX = pd.DataFrame(standardisedX, index=df_esrdonly.index, columns=df_esrdonly.columns)
standardisedX.apply(np.mean)standardisedX.apply(np.std)from sklearn import decomposition
pca = decomposition.PCA(n_components=2).fit(standardisedX)
pca = decomposition.PCA().fit(standardisedX)
def pca_summary(pca, standardised_data, out=True):
    """Tabulate the importance of each principal component.

    Parameters
    ----------
    pca
        Fitted decomposition object; only its ``explained_variance_ratio_``
        attribute and ``transform`` method are used.
    standardised_data
        Data handed to ``pca.transform`` to obtain the component scores.
    out : bool, default True
        When true, print the "Importance of components:" heading followed by
        the summary table.

    Returns
    -------
    pandas.DataFrame
        One row per component ("PC1", "PC2", ...) with two-level columns
        ("sdev", "Standard deviation"), ("varprop", "Proportion of Variance")
        and ("cumprop", "Cumulative Proportion").
    """
    ratios = np.asarray(pca.explained_variance_ratio_, dtype=float)
    names = ["PC" + str(i) for i in range(1, len(ratios) + 1)]
    # np.std defaults to ddof=0, so this is the population standard deviation
    # of the scores along each component.
    sdev = np.std(pca.transform(standardised_data), axis=0)
    # Running total in one pass instead of re-summing a growing slice.
    cumulative = np.cumsum(ratios)
    columns = pd.MultiIndex.from_tuples([
        ("sdev", "Standard deviation"),
        ("varprop", "Proportion of Variance"),
        ("cumprop", "Cumulative Proportion"),
    ])
    summary = pd.DataFrame(list(zip(sdev, ratios, cumulative)), index=names, columns=columns)
    if out:
        print("Importance of components:")
        # Print the table itself so the heading is not left dangling.
        print(summary)
    return summary

summary = pca_summary(pca, standardisedX)
plt.rcParams['figure.figsize'] = 8, 8
def screeplot(pca, standardised_values, savepath='../../progress_reports/to_submit/pca_univariate_bivariate/pca_components_variance.png'):
    """Draw a scree plot of the variance captured by each principal component.

    Parameters
    ----------
    pca
        Fitted decomposition object; only ``transform`` is used.
    standardised_values
        Data handed to ``pca.transform`` to obtain the component scores.
    savepath : str or None
        File the current figure is written to; defaults to the progress-report
        image path. Pass ``None`` (or an empty string) to skip saving.

    Notes
    -----
    NOTE(review): indentation was lost in this export; the save call is
    assumed to belong to the function body because it precedes the
    ``screeplot(...)`` call — confirm against the original notebook.
    """
    # Variance of the scores along each component (square of the ddof=0 std).
    y = np.std(pca.transform(standardised_values), axis=0) ** 2
    x = np.arange(len(y)) + 1
    plt.plot(x, y, "o-")
    plt.xticks(x, ["Comp." + str(i) for i in x], rotation=60)
    plt.title('PCA variance in Components')
    if savepath:
        plt.savefig(savepath)
screeplot(pca, standardisedX)
#plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_components_variance' + '.png')# comp 2 to comp 3 is the most change - slope
# first two or at best first 3 can be retainedsummary.sdev**2
Can be retained
PC1 4.263124 PC2 1.449435
pca.components_[0]np.sum(pca.components_[0]**2)# ref:
# not my code, I am using this as (similar to) a library function
def calcpc(variables, loadings):
    """Return the scores of one principal component for every sample.

    Each score is the sum over variables of ``value * loading``, i.e. the dot
    product of a sample's row with the loading vector.

    Parameters
    ----------
    variables
        Two-dimensional table of samples (rows) by variables (columns); a
        DataFrame or anything ``np.asarray`` can turn into a 2-D float array.
    loadings
        Loading of each variable on the component; only the first
        ``numvariables`` entries are used.

    Returns
    -------
    numpy.ndarray
        One float score per sample.
    """
    values = np.asarray(variables, dtype=float)
    # Unpacking rejects anything that is not two-dimensional.
    numsamples, numvariables = values.shape
    weights = np.asarray(loadings, dtype=float)[:numvariables]
    # One matrix-vector product replaces the per-cell Python double loop.
    return np.dot(values, weights)


calcpc(standardisedX, pca.components_[0])
pca.transform(standardisedX)[:, 0]
pca.components_[1]
np.sum(pca.components_[1]**2)
highest loadings for
# for following code : Classes will/might be defined such as like High, low, neutral mortality in final workdf_esrdonly[' ESRD patients: Avg. Annual Mortality rates']
sorted(standardisedX[' ESRD patients: Avg. Annual Mortality rates'])# Define high and low mortalityy = df_esrdonly[' ESRD patients: Avg. Annual Mortality rates'] #< 0.5
# from normalized data. > 0.5 = high mortality
print('as the older age show higher mortality, does it mean th')
y = standardisedX[' ESRD patients: Avg. Annual Mortality rates'] > 0.5
y = standardisedX[' ESRD patients: Avg. Annual Mortality rates'] > 0.5
y# # ref:
# not my code from the URL above, using this as a library function
def pca_scatter(pca, standardised_values, classifs):
    """Scatter the first two principal-component scores, coloured by class.

    Parameters
    ----------
    pca
        Fitted decomposition object; only ``transform`` is used.
    standardised_values
        Data handed to ``pca.transform``; the result must have at least two
        columns.
    classifs
        Iterable of class labels, paired with the samples in order.
    """
    scores = pca.transform(standardised_values)
    frame = pd.DataFrame(
        list(zip(scores[:, 0], scores[:, 1], classifs)),
        columns=["PC1", "PC2", "Class"],
    )
    # x / y / data are passed by keyword: current seaborn releases no longer
    # accept them positionally, while keywords work on old releases too.
    sns.lmplot(x="PC1", y="PC2", data=frame, hue="Class", fit_reg=False)
#y = df_esrdonly[' ESRD patients: Total (or %) deaths for target year']
#y = np.std(pca.transform(standardisedX), axis=0)**2
pca_scatter(pca, standardisedX, y)
# y can be used as classes like High, low, neutral mortality
# plt.suptitle('Mortality class and Principle components, y can be used as classes like High, low, neutral mortality');
plt.title('Only two components can separate the mortality data. True = High ')
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_components_separating_high_low_mortality' + '.png')# reference:
pca_components_cont = pca.components_[0:2]
plt.matshow(pca_components_cont, cmap='viridis')
plt.yticks([0,1],['1st Comp','2nd Comp'],fontsize=10)
#plt.xticks(range(1, len(df_esrdonly.columns)),df_esrdonly.columns[1:len(df_esrdonly.columns)],rotation=65)
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_food_groups_what_contributes_to_PCA_components' + '.png') from the abobe plot, Vegetable, Grain, Protein contribute the most to the 1st component
How the intake differences contributed for the affecting PCA components
diffs = list(df_esrdonly.columns[:-1])
import seaborn as sns
s = sns.heatmap(df_esrdonly[diffs].corr(),cmap='coolwarm')
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_food_groups_how_in_together_influencing_PCA_components' + '.png')
Contributing factors: vegetable is important by itself.
Protein with grain, then fruit.
Grain with fruit, and slightly with dairy.
Dairy with protein and grain*.
Fruit with protein and grain, then dairy.
#ref #
Number of columns to display
Misc to help with Regression
#folder = './data-for-code/' #'C:/Users/Sayed Ahmed/mrp_project_implementation/phase methodology and experiments/excel-xlstat-analysis/acr-data-for-analysis/food-groups/' #/regression_consolidated-food-group-based-data-with-acr-and-blood-pressure-2015-2016.xlsx'
file = data_folder + 'regression_consolidated-food-group-based-data-with-acr-and-blood-pressure-2015-2016.xlsx'
file
fileexcel_file = pd.ExcelFile(file)
# see all sheet names
sheet_names = excel_file.sheet_names
sheet_names#[:4]pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 10000)
foodgroup_acr_data_df = excel_file.parse('consolidated-food-group-acr')
foodgroup_acr_data_df.head(3)df_food_group_only = foodgroup_acr_data_df[ ['participant_id', 'food_group_name', 'avg_food_weight_in_gms'] ]
df_food_group_only.head()df_food_group_only.Tdf_food_group_only_index = df_food_group_only.set_index(['participant_id', 'food_group_name'])
df_food_group_only_indexdf_food_group_only_index_unstack = df_food_group_only_index.unstack()df_food_group_only_index_unstack.head()foodgroup_acr_data_df.columnsdf_without_food_groups = foodgroup_acr_data_df.drop(['food_group_name', 'avg_food_weight_in_gms'], axis=1)
df_without_food_groups.head()df_without_food_groups_one_entry = df_without_food_groups.groupby(['participant_id']).mean()merged = df_food_group_only_index_unstack.merge(df_without_food_groups_one_entry, how='inner', left_on='participant_id', right_on='participant_id', left_index=True)
merged.shapemerged.head()foodgroup_acr_data_df_t = foodgroup_acr_data_df.T
foodgroup_acr_data_df_t.head(10)df_multi = foodgroup_acr_data_df.set_index(['participant_id', 'food_group_name'])
df_multidf_multi = df_multi.unstack()df_multi.head()df_some = foodgroup_acr_data_df[['participant_id', 'participant_age']]
df_some.head()df_some.merge(df_multi, how='left', left_on='participant_id', right_on='participant_id')Ref: backup
