Food Subgroups

Reusing the code from Exploration Phase. The code and comments will be updated as proceeded. The code and comments might not be perfect until finalized

# PCA section, Correlation, Heatmaps, can be seen to be the most important parts
# Might have to adjust/improve my conclusions from the plots in future work
# Target variable ACR i.e. Albumin Creatinine Ratio
# Other Probable targets: Kidney failed, Systolic/Diastolic Pressure

# Tried the following Target variables previously that might not even be present now. Though might bring them back if seem to be important
# ESRD patients: Avg. Annual Mortality rates is given the most importance
# ESRD patients: Total (or %) deaths for target year
# Dialysis patients: Total (or %) deaths for target year
# For plotting, import libraries
from matplotlib import pyplot as plt
import matplotlib

# import seaborn as sns
%matplotlib inline
import pandas as pd


import numpy as np

from subprocess import check_output

from IPython.display import display, HTML
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches

import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)
import os

#data_folder = './acr-data-for-analysis/food-subgroups/'
data_folder = './data-for-code/'

data_files = os.listdir(data_folder)
data_files
['average_intake_by_food_subgroups_demo_acr_kidney_blood_pressure.xlsx',
'food-subgroups-subgroup-names-on-columns.csv',
'more-col-recom-amt-2015-2016-consolidated-food-group-based-data-with-acr-and-blood-pressure.xlsx',
'regression_consolidated-food-group-based-data-with-acr-and-blood-pressure-2015-2016.xlsx',
'regression_variables_after_pca_saved_for_factor_transformations.csv',
'regression_variables_after_pca_saved_for_factor_transformations_food_subgroup.csv',
'transformed_regression_variables_after_pca_saved_for_factor_transformations.csv']
data_file = data_folder + 'average_intake_by_food_subgroups_demo_acr_kidney_blood_pressure.xlsx'
excel_file = pd.ExcelFile(data_file)
# see all sheet names
sheet_names = excel_file.sheet_names
sheet_names[:4]
['Sheet1']pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)

foodgroup_acr_data_df = excel_file.parse('Sheet1')
foodgroup_acr_data_df.head(5)
png
foodgroup_acr_data_df.describe()
png

PCA: Apply PCA to find important features

#foodgroup_acr_data_df['food_group_id']df_food_group_only = foodgroup_acr_data_df[ ['participant_id',  'food_group_name', 'avg_food_weight_in_gms'] ]
df_food_group_only.head()

#df_food_group_only.T

df_food_group_only_index = df_food_group_only.set_index(['participant_id', 'food_group_name'])
df_food_group_only_index

df_food_group_only_index_unstack = df_food_group_only_index.unstack()

df_food_group_only_index_unstack.head()
png
foodgroup_acr_data_df.columns

df_without_food_groups = foodgroup_acr_data_df.drop(['food_group_name', 'food_group_id', 'avg_food_weight_in_gms'], axis=1)
df_without_food_groups.head()

df_without_food_groups_one_entry = df_without_food_groups.groupby(['participant_id']).mean()
df_without_food_groups_one_entry.head()
png
merged = df_food_group_only_index_unstack.merge(df_without_food_groups_one_entry,  how='inner', left_on='participant_id', right_on='participant_id', left_index=True)
merged.head()
merged.shape
merged.head()
png
# remove extra spaces in the column names
column_dict = {}
for aName in merged.columns:
#print(aName)
try:
#if ( aName.index( 'avg_food_weight_in_gms' ) >= 0):
if 'avg_food_weight_in_gms' in aName:
#aName =
column_dict[aName] = aName[1] #aName.strip()
else:
column_dict[aName] = aName #.strip()

except:
print(aName)
continue

# column_dict
merged = merged.rename(index=str, columns=column_dict)
merged.head()
png
merged_f = merged.fillna(0)
merged_f.to_csv(data_folder + 'food-subgroups-subgroup-names-on-columns.csv')
regr_data = pd.read_csv(data_folder + 'regression_variables_after_pca_saved_for_factor_transformations.csv')
regr_data_t = regr_data.T
regr_data_t.to_csv(data_folder + 'transformed_regression_variables_after_pca_saved_for_factor_transformations.csv')
merged.columnsIndex(['Added Sugars/Sugars and sweets', 'Alcoholic beverages',
'Citrus fruits, juices', 'Dark-green vegetables', 'Dried fruits',
'Fruit juices and nectars excluding citrus',
'Fruits and juices baby food', 'Meat, Poultry and Eggs subgroup',
'Milk desserts, sauces, gravies', 'Milks and milk drinks ',
'Nonalcoholic beverages', 'Nuts, Seeds, and Soy Products', 'Oils',
'Other fruits', 'Other vegetables', 'Protein',
'Red and orange vegetables', 'Refined grains', 'Seafood', 'Solid Fats',
'Starchy vegetables', 'Water, noncarbonated', 'Whole grains',
'participant_age', 'gender', 'acr', 'kidney_failed',
'systolic_pressure', 'diastolic_pressure', 'calorie', 'protein',
'sodium', 'carbohydrate', 'sugar', 'fibre', 'fat', 'saturated_fat',
'mono_fat', 'poly_fat', 'cholesterol', 'calcium', 'phosphorous',
'magnesium', 'potassium', 'alcohol', 'a_sample_food_code',
'albumin_urine_mu_g', 'albumin_urine_mg', 'creatinine_mu_mol',
'creatinine_mg', 'received_dialysis_in_12_months', 'kidney_stones',
'passed_kidney_stones_12_months', 'urinary_leakage_frequency',
'urine_lose_each_time', 'leak_during_activities',
'how_frequent_leak_occurs', 'urinated_before_reaching_toilet',
'how_frequent', 'leak_during_nonphysical_activities',
'how_frequest_leak_nonphysical', 'how_much_leak_bothering',
'how_much_daily_activities_affected', 'count_night_time_urinate'],
dtype='object')
merged.columnsIndex(['Added Sugars/Sugars and sweets', 'Alcoholic beverages',
'Citrus fruits, juices', 'Dark-green vegetables', 'Dried fruits',
'Fruit juices and nectars excluding citrus',
'Fruits and juices baby food', 'Meat, Poultry and Eggs subgroup',
'Milk desserts, sauces, gravies', 'Milks and milk drinks ',
'Nonalcoholic beverages', 'Nuts, Seeds, and Soy Products', 'Oils',
'Other fruits', 'Other vegetables', 'Protein',
'Red and orange vegetables', 'Refined grains', 'Seafood', 'Solid Fats',
'Starchy vegetables', 'Water, noncarbonated', 'Whole grains',
'participant_age', 'gender', 'acr', 'kidney_failed',
'systolic_pressure', 'diastolic_pressure', 'calorie', 'protein',
'sodium', 'carbohydrate', 'sugar', 'fibre', 'fat', 'saturated_fat',
'mono_fat', 'poly_fat', 'cholesterol', 'calcium', 'phosphorous',
'magnesium', 'potassium', 'alcohol', 'a_sample_food_code',
'albumin_urine_mu_g', 'albumin_urine_mg', 'creatinine_mu_mol',
'creatinine_mg', 'received_dialysis_in_12_months', 'kidney_stones',
'passed_kidney_stones_12_months', 'urinary_leakage_frequency',
'urine_lose_each_time', 'leak_during_activities',
'how_frequent_leak_occurs', 'urinated_before_reaching_toilet',
'how_frequent', 'leak_during_nonphysical_activities',
'how_frequest_leak_nonphysical', 'how_much_leak_bothering',
'how_much_daily_activities_affected', 'count_night_time_urinate'],
dtype='object')
df_drop_pca = merged[

[ 'Added Sugars/Sugars and sweets',
'Alcoholic beverages',
'Citrus fruits, juices',
'Dark-green vegetables',
'Dried fruits',
'Fruit juices and nectars excluding citrus',
'Fruits and juices baby food',
'Meat, Poultry and Eggs subgroup',
'Milk desserts, sauces, gravies',
'Milks and milk drinks ',
'Nonalcoholic beverages',
'Nuts, Seeds, and Soy Products', 'Oils',
'Other fruits',
'Other vegetables',
'Protein',
'Red and orange vegetables',
'Refined grains',
'Seafood',
'Solid Fats',
'Starchy vegetables',
'Water, noncarbonated',
'Whole grains',
#'participant_age',
#'gender',
#'acr',
#'kidney_failed',
#'systolic_pressure',
#'diastolic_pressure',
#'calorie',
#'protein',
#'sodium',
#'carbohydrate',
#'sugar',
#'fibre',
#'fat',
#'saturated_fat',
#'mono_fat',
#'poly_fat',
#'cholesterol',
#'calcium',
#'phosphorous',
#'magnesium',
#'potassium',
#'alcohol',

#'a_sample_food_code',
#'albumin_urine_mu_g', 'albumin_urine_mg', 'creatinine_mu_mol',
#'creatinine_mg', 'received_dialysis_in_12_months', 'kidney_stones',
#'passed_kidney_stones_12_months', 'urinary_leakage_frequency',
#'urine_lose_each_time', 'leak_during_activities',
#'how_frequent_leak_occurs', 'urinated_before_reaching_toilet',
#'how_frequent', 'leak_during_nonphysical_activities',
#'how_frequest_leak_nonphysical', 'how_much_leak_bothering',
#'how_much_daily_activities_affected', 'count_night_time_urinate'
]


]
# removing text features and/or not that important features
#foodgroup_acr_data_df
#df_drop_pca = merged #.drop([ 'food_group_from_recom_amount', 'm_food_group_name', 'a_sample_food', 'a_sample_food_name'], axis=1)
#df_drop_pca = df_drop_pca.drop(['acr'], axis=1)
# 'participant_age', 'gender',
df_drop_pca.fillna(0, inplace=True)

df_drop_pca.head(10)
png
# steps followed from ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
import sklearn
from sklearn import preprocessing
standardisedX = sklearn.preprocessing.scale(df_drop_pca)
standardisedX = pd.DataFrame(standardisedX, index=df_drop_pca.index, columns=df_drop_pca.columns)
standardisedX.apply(np.mean)
Added Sugars/Sugars and sweets 2.091783e-16
Alcoholic beverages 3.822242e-16
Citrus fruits, juices 1.694832e-17
Dark-green vegetables 9.322008e-16
Dried fruits 1.785011e-15
Fruit juices and nectars excluding citrus -2.310308e-16
Fruits and juices baby food 7.944569e-16
Meat, Poultry and Eggs subgroup -8.868298e-17
Milk desserts, sauces, gravies -3.878713e-16
Milks and milk drinks -1.797387e-16
Nonalcoholic beverages 6.543493e-17
Nuts, Seeds, and Soy Products 1.522320e-16
Oils 9.512190e-16
Other fruits -2.575567e-16
Other vegetables -3.786687e-17
Protein 1.673196e-16
Red and orange vegetables -3.572561e-16
Refined grains -1.664181e-16
Seafood 4.954679e-16
Solid Fats -4.734747e-16
Starchy vegetables -6.223422e-16
Water, noncarbonated -4.369421e-17
Whole grains 9.678572e-17
dtype: float64
standardisedX.apply(np.std)
from sklearn import decomposition
#pca = decomposition.PCA(n_components=2).fit(standardisedX)
pca = decomposition.PCA().fit(standardisedX)
pca
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
svd_solver='auto', tol=0.0, whiten=False)
#ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
def pca_summary(pca, standardised_data, out=True):
names = ["PC"+str(i) for i in range(1, len(pca.explained_variance_ratio_)+1)]
a = list(np.std(pca.transform(standardised_data), axis=0))
b = list(pca.explained_variance_ratio_)
c = [np.sum(pca.explained_variance_ratio_[:i]) for i in range(1, len(pca.explained_variance_ratio_)+1)]
columns = pd.MultiIndex.from_tuples([("sdev", "Standard deviation"), ("varprop", "Proportion of Variance"), ("cumprop", "Cumulative Proportion")])
summary = pd.DataFrame( list(zip(a, b, c)), index=names, columns=columns)
if out:
print("Importance of components:")
display(summary)
return summary
summary = pca_summary(pca, standardisedX)Importance of components:
png

First 22 components can define 83%, 26 Comp over 90%

# ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
def screeplot(pca, standardised_values):
y = np.std(pca.transform(standardised_values), axis=0)**2
x = np.arange(len(y)) + 1
plt.plot(x, y, "o-")
plt.xticks(x, ["Comp."+str(i) for i in x], rotation=60)
plt.ylabel("Variance")
plt.title('PCA variance in Components')
plt.savefig('pca_components_variance' + '.png')
plt.show()
plt.rcParams['figure.figsize'] = 10, 10
screeplot(pca, standardisedX)
plt.savefig('./images/pca_components_variance_food_subgroup' + '.png')
png
<Figure size 720x720 with 0 Axes>

comp 3 to comp 4 is the most change — slope

first three can be retained

Other big slope changes at:5 to 6, 8 to 9, 25 to 26, 30 to 31

Though upto 3 can be significant, upto 26 can be experimented with, or even just upto 8

just upto 8 is preferred

# ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
# not my code, I am using this as (similar to) a library function
def calcpc(variables, loadings):
# find the number of samples in the data set and the number of variables
numsamples, numvariables = variables.shape
# make a vector to store the component
pc = np.zeros(numsamples)
# calculate the value of the component for each sample
for i in range(numsamples):
valuei = 0
for j in range(numvariables):
valueij = variables.iloc[i, j]
loadingj = loadings[j]
valuei = valuei + (valueij * loadingj)
pc[i] = valuei
return pc
calcpc(standardisedX, pca.components_[0])array([ 0.4024334 , 2.86288165, 0.63548882, ..., -0.27547279,
-0.40052422, 0.01408633])
pca.transform(standardisedX)[:, 0]array([ 0.4024334 , 2.86288165, 0.63548882, ..., -0.27547279,
-0.40052422, 0.01408633])
pca.components_[1]array([-0.04013536, -0.21662898, 0.2646046 , 0.2096937 , 0.15558643,
0.23267299, -0.01198426, -0.10661167, 0.24079041, 0.30859506,
-0.15275508, 0.22375509, 0.14752615, 0.45938378, 0.30101746,
0.23534633, 0.25186201, 0.19465321, 0.16763907, 0.08938424,
0.06329616, 0.01186119, -0.02546397])
np.sum(pca.components_[1]**2)1.0000000000000004# for following code : Classes will/might be defined such as like High, low, neutral mortality in final work
#df_drop_pca['acr']
#sorted(standardisedX['acr'])
"""
y = df_drop_pca['acr'] #< 0.5
#y
dbk= np.where(df_drop_pca['acr'] < 30, 'normal-mild', df_drop_pca['acr'])
dbk = np.where(df_drop_pca['acr'] > 30, 'moderate', df_drop_pca['acr'])
dbk
"""

y = []
for aVal in merged['acr']:
if aVal < 30:
y.append('normal-mild')
if aVal > 300:
y.append('severe')
elif aVal < 150:
y.append('mild-moderate-low')
elif aVal > 150:
#print(aVal)
y.append('mild-moderate-high')

y
['normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-high',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-high',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'severe',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-high',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'severe',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'severe',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'severe',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'severe',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'severe',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'severe',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'severe',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'severe',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-high',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'severe',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'severe',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-high',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-high',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'severe',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'severe',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-high',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'severe',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
'normal-mild',
'mild-moderate-low',
...]
# # ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
# not my code from the URL above, using this as a library function
def pca_scatter(pca, standardised_values, classifs):
foo = pca.transform(standardised_values)
bar = pd.DataFrame(list(zip(foo[:, 0], foo[:, 1], classifs)), columns=["PC1", "PC2", "Class"])
#plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_components_separating_high_low_mortality' + '.png')
sns.lmplot("PC1", "PC2", bar, hue="Class", fit_reg=False)

#y = df_esrdonly[' ESRD patients: Total (or %) deaths for target year']
#y = np.std(pca.transform(standardisedX), axis=0)**2

plt.rcParams['figure.figsize'] = 12, 12
pca_scatter(pca, standardisedX, y)
png
#plt.title('Only two components can separate the mortality data. True = High ')
#plt.savefig('pca_components_separating_high_low_mortality' + '.png')
# reference: https://towardsdatascience.com/dive-into-pca-principal-component-analysis-with-python-43ded13ead21
plt.rcParams['figure.figsize'] = 30, 16
components_to_count = 3

pca_components_cont = pca.components_[0:components_to_count]
ylabels = []

for c in range (components_to_count):
ylabels.append('Component ' + str(c) )

xlabels = []
for c in range (components_to_count):
xlabels.append(c)

plt.matshow(pca_components_cont, cmap='viridis')
plt.yticks(xlabels, ylabels, fontsize=10)
plt.colorbar()

plt.xticks(range(len(df_drop_pca.columns)), df_drop_pca.columns, rotation=90, ha='left')
plt.show()#
png
# Negative: Refined grain, Milk drink, solid fat# reference: https://towardsdatascience.com/dive-into-pca-principal-component-analysis-with-python-43ded13ead21
plt.rcParams['figure.figsize'] = 30, 16
components_to_count = 8

pca_components_cont = pca.components_[0:components_to_count]
ylabels = []

for c in range (components_to_count):
ylabels.append('Component ' + str(c) )

xlabels = []
for c in range (components_to_count):
xlabels.append(c)

plt.matshow(pca_components_cont, cmap='viridis')
plt.yticks(xlabels, ylabels, fontsize=10)
plt.colorbar()

plt.xticks(range(len(df_drop_pca.columns)), df_drop_pca.columns, rotation=90, ha='left')
plt.show()#
png

From 3 components: Non alc, other fruit, dark green alc, seafood, milk

1st component:Non alcoholic, Meat, Alcoholic, starchy vegetable, : water, whole grain, solid fat, protein 2nd Component: Other fruits, milks,
3rd component: Dark Green, Seafood

From 7 components: Fruit and Juices, non alc, nuts, oils, other fruits, meat, alc, dark v protein, refined, whole grain, fat

selection for regression

regression_variables = merged[ 

[

#'Added Sugars/Sugars and sweets', :
'Alcoholic beverages',
#'Citrus fruits, juices',
'Dark-green vegetables',
#'Dried fruits',
#'Fruit juices and nectars excluding citrus',
'Fruits and juices baby food',
'Meat, Poultry and Eggs subgroup',
'Milk desserts, sauces, gravies',
'Milks and milk drinks ',
'Nonalcoholic beverages',
'Nuts, Seeds, and Soy Products', 'Oils',
'Other fruits',
#'Other vegetables',
'Protein',
#'Red and orange vegetables',
'Refined grains',
'Seafood',
'Solid Fats',
'Starchy vegetables',
'Water, noncarbonated',
'Whole grains',
#'participant_age',
#'gender',
'acr',
'kidney_failed',
'systolic_pressure',
'diastolic_pressure'
]
]
regression_variables.corr()
png
plt.figure(figsize=(16, 16))

corr = regression_variables.corr()
sns.heatmap(corr,
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True,
cmap="BuPu");
plt.suptitle('Heatmap, Correlation All Variables');
plt.savefig('./images/regression_food_subgroup_and_acr.png')
png

Affecting: ACR: Milk dessert, meat and Poultry, Alcohol, High Blood Pressure related Kidney failed: dark green vegetable, Milk Dessert

Code below are not part of methodology and experiment part

# The above plot kind of shows the correlation with CKD and ACR
df_drop_pca.columns
Index(['Added Sugars/Sugars and sweets', 'Alcoholic beverages',
'Citrus fruits, juices', 'Dark-green vegetables', 'Dried fruits',
'Fruit juices and nectars excluding citrus',
'Fruits and juices baby food', 'Meat, Poultry and Eggs subgroup',
'Milk desserts, sauces, gravies', 'Milks and milk drinks ',
'Nonalcoholic beverages', 'Nuts, Seeds, and Soy Products', 'Oils',
'Other fruits', 'Other vegetables', 'Protein',
'Red and orange vegetables', 'Refined grains', 'Seafood', 'Solid Fats',
'Starchy vegetables', 'Water, noncarbonated', 'Whole grains'],
dtype='object')
df_drop_pca_important = df_drop_pca[


[ 'Dairy', 'Fats, oils, and salad dressings', 'Fruits', 'Grains',
'Protein', 'Sugars, sweets, and beverages', 'Vegetables',
'participant_age', 'gender',
#'age_from_recom_amount',
#'avg_taken_from_recom_amount',
#'recommended_low',
#'recommended_high',
'acr',
'kidney_failed', 'systolic_pressure', 'diastolic_pressure',
'calorie', 'protein', 'sodium', 'carbohydrate', 'sugar', 'fibre', 'fat',
'saturated_fat', 'mono_fat', 'poly_fat', 'cholesterol', 'calcium',
'phosphorous', 'magnesium', 'potassium', 'alcohol',
#'a_sample_food_code',
'albumin_urine_mu_g', 'albumin_urine_mg',
'creatinine_mu_mol', 'creatinine_mg',
#'received_dialysis_in_12_months',
'kidney_stones', 'passed_kidney_stones_12_months',
'urinary_leakage_frequency', 'urine_lose_each_time',
'leak_during_activities', 'how_frequent_leak_occurs',
'urinated_before_reaching_toilet', 'how_frequent',
'leak_during_nonphysical_activities', 'how_frequest_leak_nonphysical',
'how_much_leak_bothering', 'how_much_daily_activities_affected'
'count_night_time_urinate'
]

]
---------------------------------------------------------------------------

KeyError Traceback (most recent call last)

<ipython-input-44-b95dd96c8f16> in <module>()
23 'urinated_before_reaching_toilet', 'how_frequent',
24 'leak_during_nonphysical_activities', 'how_frequest_leak_nonphysical',
---> 25 'how_much_leak_bothering', 'how_much_daily_activities_affected'
26 'count_night_time_urinate'
27 ]


C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pandas\core\frame.py in __getitem__(self, key)
2677 if isinstance(key, (Series, np.ndarray, Index, list)):
2678 # either boolean or fancy integer index
-> 2679 return self._getitem_array(key)
2680 elif isinstance(key, DataFrame):
2681 return self._getitem_frame(key)


C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pandas\core\frame.py in _getitem_array(self, key)
2721 return self._take(indexer, axis=0)
2722 else:
-> 2723 indexer = self.loc._convert_to_indexer(key, axis=1)
2724 return self._take(indexer, axis=1)
2725


C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pandas\core\indexing.py in _convert_to_indexer(self, obj, axis, is_setter)
1325 if mask.any():
1326 raise KeyError('{mask} not in index'
-> 1327 .format(mask=objarr[mask]))
1328
1329 return com._values_from_object(indexer)


KeyError: "['Dairy' 'Fats, oils, and salad dressings' 'Fruits' 'Grains'\n 'Sugars, sweets, and beverages' 'Vegetables' 'participant_age' 'gender'\n 'acr' 'kidney_failed' 'systolic_pressure' 'diastolic_pressure' 'calorie'\n 'protein' 'sodium' 'carbohydrate' 'sugar' 'fibre' 'fat' 'saturated_fat'\n 'mono_fat' 'poly_fat' 'cholesterol' 'calcium' 'phosphorous' 'magnesium'\n 'potassium' 'alcohol' 'albumin_urine_mu_g' 'albumin_urine_mg'\n 'creatinine_mu_mol' 'creatinine_mg' 'kidney_stones'\n 'passed_kidney_stones_12_months' 'urinary_leakage_frequency'\n 'urine_lose_each_time' 'leak_during_activities'\n 'how_frequent_leak_occurs' 'urinated_before_reaching_toilet'\n 'how_frequent' 'leak_during_nonphysical_activities'\n 'how_frequest_leak_nonphysical' 'how_much_leak_bothering'\n 'how_much_daily_activities_affectedcount_night_time_urinate'] not in index"
df_drop_pca_important.corr()plt.figure(figsize=(12, 8))

corr = df_drop_pca_important.corr()
sns.heatmap(corr,
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation All Variables');

Will Check on Actual Taken Amount Only

age_from and age_to can be removed

df = pd.read_csv(data_folder + 'no-empty-data-only-actual-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing.csv')
# df = df.drop([ 'age_from', 'age_to' ], axis=1)
df.head()
---------------------------------------------------------------------------

FileNotFoundError Traceback (most recent call last)

<ipython-input-45-69caa1d1a88a> in <module>()
----> 1 df = pd.read_csv(data_folder + 'no-empty-data-only-actual-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing.csv')
2 # df = df.drop([ 'age_from', 'age_to' ], axis=1)
3 df.head()


C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, doublequote, delim_whitespace, low_memory, memory_map, float_precision)
676 skip_blank_lines=skip_blank_lines)
677
--> 678 return _read(filepath_or_buffer, kwds)
679
680 parser_f.__name__ = name


C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
438
439 # Create the parser.
--> 440 parser = TextFileReader(filepath_or_buffer, **kwds)
441
442 if chunksize or iterator:


C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds)
785 self.options['has_index_names'] = kwds['has_index_names']
786
--> 787 self._make_engine(self.engine)
788
789 def close(self):


C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine)
1012 def _make_engine(self, engine='c'):
1013 if engine == 'c':
-> 1014 self._engine = CParserWrapper(self.f, **self.options)
1015 else:
1016 if engine == 'python':


C:\Program Files (x86)\Microsoft Visual Studio\Shared\Anaconda3_64\lib\site-packages\pandas\io\parsers.py in __init__(self, src, **kwds)
1706 kwds['usecols'] = self.usecols
1707
-> 1708 self._reader = parsers.TextReader(src, **kwds)
1709
1710 passed_names = self.names is None


pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()


pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._setup_parser_source()


FileNotFoundError: File b'./data-for-code/no-empty-data-only-actual-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing.csv' does not exist
df.describe()#find correlation with price after standardization
df.corr()

age_from age_to Actual Vegetable Intake Actual Protein Intake Actual Grain Intake Actual Dairy Intake Actual Fruit intakes Actual Taken Sugars sweets and beverages amount Avg Fats oils and salad dressings taken ESRD patients: Total (or %) deaths for target year ESRD patients: Avg. Annual Mortality rates Dialysis patients: Total (or %) deaths for target year Dialysis patients: Avg. Annual Mortality rates 0.907545 0.907790 0.606775 0.040950 -0.452625 -0.408543 -0.372665 0.178760 -0.081677 1.000000 0.828148 0.999315 0.864052

ESRD patients: Avg. Annual Mortality rates 0.854841 0.854759 0.444925 -0.151035 -0.572187 -0.238108 -0.369639 0.022898 -0.202818 0.828148 1.000000 0.847294 0.995676 Dialysis patients: Total (or %) deaths for target year 0.914524 0.914723 0.603179 0.031982 -0.462540 -0.404490 -0.375241 0.172767 -0.087055 0.999315 0.847294 1.000000 0.880927 Dialysis patients: Avg. Annual Mortality rates 0.878659 0.879439 0.480531 -0.134234 -0.580723 -0.264096 -0.370520 0.050212 -0.208805 0.864052 0.995676 0.880927 1.000000

plt.figure(figsize=(12, 8))

corr = df.corr()
sns.heatmap(corr,
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation Actual Intake Amounts Only');
# Without Age Groupsdf_without_ages = df.drop(['age_from', 'age_to', ' Gender'], axis=1)
df_without_ages.head()
df_without_ages.corr()plt.figure(figsize=(12, 8))

corr = df_without_ages.corr()
sns.heatmap(corr,
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation Actual Intake Amounts Only');
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/univariate_food_groups_heatmaps_actual_intake_amount' + '.png')

Will use Normalization all data

df = pd.read_csv('no-empty-data-only-actual-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing.csv')
df = df.drop([' Gender'], axis=1)
df.head()
# normalize data
df_normalized = (df - df.mean())/ (df.max() - df.min())
df_normalized.head()
# df.head()
df_normalized.corr()

age_from age_to Actual Vegetable Intake Actual Protein Intake Actual Grain Intake Actual Dairy Intake Actual Fruit intakes Actual Taken Sugars sweets and beverages amount Avg Fats oils and salad dressings taken ESRD patients: Total (or %) deaths for target year ESRD patients: Avg. Annual Mortality rates Dialysis patients: Total (or %) deaths for target year Dialysis patients: Avg. Annual Mortality rates ESRD patients: Total (or %) deaths for target year 0.907545 0.907790 0.606775 0.040950 -0.452625 -0.408543 -0.372665 0.178760 -0.081677 1.000000 0.828148 0.999315 0.864052 ESRD patients: Avg. Annual Mortality rates 0.854841 0.854759 0.444925 -0.151035 -0.572187 -0.238108 -0.369639 0.022898 -0.202818 0.828148 1.000000 0.847294 0.995676 Dialysis patients: Total (or %) deaths for target year 0.914524 0.914723 0.603179 0.031982 -0.462540 -0.404490 -0.375241 0.172767 -0.087055 0.999315 0.847294 1.000000 0.880927 Dialysis patients: Avg. Annual Mortality rates 0.878659 0.879439 0.480531 -0.134234 -0.580723 -0.264096 -0.370520 0.050212 -0.208805 0.864052 0.995676 0.880927 1.000000

plt.figure(figsize=(12, 8))

corr = df_normalized.corr()
sns.heatmap(corr,
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation Actual Intake Amounts Only');
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/univariate_food_groups_heatmaps_normalized_actual_intake_amount' + '.png')

Will check only on the difference from average recommended amount

df = pd.read_csv('copy-only-diff-no-empty-data-diff-recomm-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing .csv')
df.head()
df.describe()# find correlation with price after standardization
df.corr()

age_from age_to Actual Vegetable Intake Actual Protein Intake Actual Grain Intake Actual Dairy Intake Actual Fruit intakes Actual Taken Sugars sweets and beverages amount Avg Fats oils and salad dressings taken Diff Vegetable diff protein diff grain diff dairy diff fruit ESRD patients: Total (or %) deaths for target year ESRD patients: Avg. Annual Mortality rates Dialysis patients: Total (or %) deaths for target year Dialysis patients: Avg. Annual Mortality rates ESRD patients: Total (or %) deaths for target year

0.907545 0.907790 0.606775 0.040950 -0.452625 -0.408543 -0.372665 0.178760 -0.081677 0.426499 -0.273596 -0.268453 -0.364665 -0.488155 1.000000 0.828148 0.999315 0.864052

ESRD patients: Avg. Annual Mortality rates 0.854841 0.854759 0.444925 -0.151035 -0.572187 -0.238108 -0.369639 0.022898 -0.202818 0.359590 -0.266776 -0.263227 -0.213864 -0.392090 0.828148 1.000000 0.847294 0.995676

Dialysis patients: Total (or %) deaths for target year 0.914524 0.914723 0.603179 0.031982 -0.462540 -0.404490 -0.375241 0.172767 -0.087055 0.424340 -0.278817 -0.272938 -0.362140 -0.489873 0.999315 0.847294 1.000000 0.880927

Dialysis patients: Avg. Annual Mortality rates 0.878659 0.879439 0.480531 -0.134234 -0.580723 -0.264096 -0.370520 0.050212 -0.208805 0.401294 -0.244088 -0.246893 -0.229239 -0.387759 0.864052 0.995676 0.880927 1.000000

plt.figure(figsize=(12, 8))

corr = df.corr()
sns.heatmap(corr,
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation Actual Intake Variables, and Difference from Recommended Intake Variables');
df.head()
df = df.drop([' Gender'], axis=1)
df.head()
# normalize data

df_normalized = (df - df.mean())/ (df.max() - df.min())
df_normalized.head()
#df.head()
df_normalized.corr()plt.figure(figsize=(12, 8))

corr = df_normalized.corr()
sns.heatmap(corr,
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation Actual Intake Variables, and Difference from Recommended Intake Variables');
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/univariate_food_groups_heatmaps_diff_intake_amount_normalized' + '.png')

Bivariate

The most important for Bivariate: bivariate_diff_norm.png and bivariate_diff_norm_rate_only.png. Difference in intake amounts from recommended, also normalized.

Bivariate plots on actual amount intake and target variables. will be saved in bivariate.png. The correlation pattern can be checked in the image saved

Correlation within the intake amounts do not say much as we are using actual intake amounts not differences with recomended amount. Also because, higher aged people will take higher amount. Also, normalized data will give true relation.

The correlation was shown above. Now linearity and non-linearity can be seen using the Bivariate plots.

The plots show similarity with correlation

import numpy as np 
import pandas as pd

from IPython.display import display, HTML

from matplotlib import pyplot as plt
import matplotlib.patches as mpatches

import seaborn as sns
%matplotlib inline

# ref in ref section

On Actual intake Amounts

df = pd.read_csv('no-empty-data-only-actual-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing.csv')
#df_actual = df.drop([' Gender', 'age_from', 'age_to',' ESRD patients: Avg. Annual Mortality rates', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
#df_actual = df.drop([' Gender', 'age_from', 'age_to',' ESRD patients: Avg. Annual Mortality rates', ' ESRD patients: Total (or %) deaths for target year', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
df_actual = df.drop([' Gender', 'age_from', 'age_to'], axis=1)
df_actual.head()
# on actual amounts
#plt.figure(figsize=(16, 300))
sns.pairplot(df_actual, vars=df_actual.columns, size=5, kind='reg'); # diag_kind='kde',
plt.title('Bivariate Plot, All Actual Taken Variables, Total ESRD target variable');
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/bivariate_food_group' + '.png')
plt.show()

On difference from recommended amount (data not normalized)

df = pd.read_csv('copy-only-diff-no-actual-no-empty-cell-diff-recomm-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing .csv')
df.head()
# On difference from recommended amount (data not normalized)

#df_actual_diff = df.drop(['age_from', 'age_to',' ESRD patients: Avg. Annual Mortality rates', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
#df_actual_diff.head()
#df_actual = df.drop([' Gender', 'age_from', 'age_to',' ESRD patients: Avg. Annual Mortality rates', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
#df_actual = df.drop([' Gender', 'age_from', 'age_to',' ESRD patients: Avg. Annual Mortality rates', ' ESRD patients: Total (or %) deaths for target year', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
df_actual_diff = df.drop(['age_from', 'age_to'], axis=1)
df_actual_diff.head()
#df.head()
# On difference from recommended amount (data not normalized)

sns.pairplot(df_actual_diff, vars=df_actual_diff.columns, size=5, kind='reg');
#plt.title('Bivariate Plot, All Actual Taken Variables, Total ESRD target variable');
#plt.savefig('./saved_images_from_visualizations/' + 'line_' +indicator.replace(' ', '_')[0:10] + '_' + str(np.random.randint(0, 99999)) + '.png')
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/bivariate_diff' + '.png')
plt.show()

On difference from recommended amount (data/diff normalized)

df_normalized_diff = (df_actual_diff - df_actual_diff.mean())/ (df_actual_diff.max() - df_actual_diff.min())
df_normalized_diff.head()

sns.pairplot(df_normalized_diff, vars=df_normalized_diff.columns, size=5, kind='reg', aspect=1); # diag_kind='kde',
#plt.title('Bivariate Plot, All Actual Taken Variables, Total ESRD target variable');
#plt.savefig('./saved_images_from_visualizations/' + 'line_' +indicator.replace(' ', '_')[0:10] + '_' + str(np.random.randint(0, 99999)) + '.png')
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/bivariate_diff_norm' + '.png')
plt.show()
df_normalized_diff = df_normalized_diff.drop([' ESRD patients: Total (or %) deaths for target year', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates'], axis=1)
df_normalized_diff.head()
sns.pairplot(df_normalized_diff, vars=df_normalized_diff.columns, size=5, kind='reg', aspect=1); # diag_kind='kde',
plt.suptitle('Bivariate : Diff : Food Group: Normalized\n')
plt.ylabel('Difference in Intake amount from Recommended : Normalized')
plt.xlabel('Intakes')
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/bivariate_diff_norm_rate_only' + '.png')
plt.show()
# can be ignored
"""
df_esrdonly = df.drop(['age_from', 'age_to',' ESRD patients: Avg. Annual Mortality rates', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
df_esrdonly.head()

plt.figure(figsize=(14, 14))
sns.pairplot(df_esrdonly, diag_kind='kde');
plt.xlabel('Bivariate Plot, Difference from Recommended Variables, Total ESRD target variable');

#plt.figure(figsize=(16, 16))
plt.rcParams['figure.figsize'] = 16, 16
pd.plotting.scatter_matrix(df_esrdonly, diagonal="kde")
plt.suptitle('Scatter Matrix');
plt.show();
"""

PCA

Applying PCA on Difference data

df_esrdonly = df.drop(['age_from', 'age_to',' ESRD patients: Total (or %) deaths for target year', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
df_esrdonly.head()
#df_esrdonly.head()
# steps followed from ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html

import sklearn
from sklearn import preprocessing
standardisedX = sklearn.preprocessing.scale(df_esrdonly)
standardisedX = pd.DataFrame(standardisedX, index=df_esrdonly.index, columns=df_esrdonly.columns)
standardisedX.apply(np.mean)
standardisedX.apply(np.std)from sklearn import decomposition
pca = decomposition.PCA(n_components=2).fit(standardisedX)
pca = decomposition.PCA().fit(standardisedX)
pca
#ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
def pca_summary(pca, standardised_data, out=True):
names = ["PC"+str(i) for i in range(1, len(pca.explained_variance_ratio_)+1)]
a = list(np.std(pca.transform(standardised_data), axis=0))
b = list(pca.explained_variance_ratio_)
c = [np.sum(pca.explained_variance_ratio_[:i]) for i in range(1, len(pca.explained_variance_ratio_)+1)]
columns = pd.MultiIndex.from_tuples([("sdev", "Standard deviation"), ("varprop", "Proportion of Variance"), ("cumprop", "Cumulative Proportion")])
summary = pd.DataFrame( list(zip(a, b, c)), index=names, columns=columns)
if out:
print("Importance of components:")
display(summary)
return summary
summary = pca_summary(pca, standardisedX)# First two component can define over 95%np.sum(summary.sdev**2)# ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
plt.rcParams['figure.figsize'] = 8, 8

def screeplot(pca, standardised_values):
y = np.std(pca.transform(standardised_values), axis=0)**2
x = np.arange(len(y)) + 1
plt.plot(x, y, "o-")
plt.xticks(x, ["Comp."+str(i) for i in x], rotation=60)
plt.ylabel("Variance")
plt.title('PCA variance in Components')
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_components_variance' + '.png')
plt.show()



screeplot(pca, standardisedX)
#plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_components_variance' + '.png')
# comp 2 to comp 3 is the most change - slope
# first two or at best first 3 can be retained
summary.sdev**2

Can be retained

PC1 4.263124 PC2 1.449435

pca.components_[0]np.sum(pca.components_[0]**2)# ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
# not my code, I am using this as (similar to) a library function
def calcpc(variables, loadings):
# find the number of samples in the data set and the number of variables
numsamples, numvariables = variables.shape
# make a vector to store the component
pc = np.zeros(numsamples)
# calculate the value of the component for each sample
for i in range(numsamples):
valuei = 0
for j in range(numvariables):
valueij = variables.iloc[i, j]
loadingj = loadings[j]
valuei = valuei + (valueij * loadingj)
pc[i] = valuei
return pc
calcpc(standardisedX, pca.components_[0])pca.transform(standardisedX)[:, 0]pca.components_[1]np.sum(pca.components_[1]**2)

highest loadings for

# for following code : Classes will/might be defined such as like High, low, neutral mortality in final workdf_esrdonly[' ESRD patients: Avg. Annual Mortality rates']
sorted(standardisedX[' ESRD patients: Avg. Annual Mortality rates'])
# Define high and low mortalityy = df_esrdonly[' ESRD patients: Avg. Annual Mortality rates'] #< 0.5
# from normalized data. > 0.5 = high mortality
print('as the older age show higher mortality, does it mean th')
y = standardisedX[' ESRD patients: Avg. Annual Mortality rates'] > 0.5
y = standardisedX[' ESRD patients: Avg. Annual Mortality rates'] > 0.5
y
# # ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
# not my code from the URL above, using this as a library function
def pca_scatter(pca, standardised_values, classifs):
foo = pca.transform(standardised_values)
bar = pd.DataFrame(list(zip(foo[:, 0], foo[:, 1], classifs)), columns=["PC1", "PC2", "Class"])
#plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_components_separating_high_low_mortality' + '.png')
sns.lmplot("PC1", "PC2", bar, hue="Class", fit_reg=False)

#y = df_esrdonly[' ESRD patients: Total (or %) deaths for target year']
#y = np.std(pca.transform(standardisedX), axis=0)**2
pca_scatter(pca, standardisedX, y)

# y can be used as classes like High, low, neutral mortality

# plt.suptitle('Mortality class and Principle components, y can be used as classes like High, low, neutral mortality');
plt.title('Only two components can separate the mortality data. True = High ')
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_components_separating_high_low_mortality' + '.png')
# reference: https://towardsdatascience.com/dive-into-pca-principal-component-analysis-with-python-43ded13ead21
pca_components_cont = pca.components_[0:2]

plt.matshow(pca_components_cont, cmap='viridis')
plt.yticks([0,1],['1st Comp','2nd Comp'],fontsize=10)
plt.colorbar()
#plt.xticks(range(1, len(df_esrdonly.columns)),df_esrdonly.columns[1:len(df_esrdonly.columns)],rotation=65)
plt.xticks(range(len(df_esrdonly.columns)),df_esrdonly.columns,rotation=65,ha='left')
#plt.tight_layout()
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_food_groups_what_contributes_to_PCA_components' + '.png')
plt.show()#
# from the abobe plot, Vegetable, Grain, Protein contribute the most to the 1st component

How the intake differences contributed for the affecting PCA components

diffs = list(df_esrdonly.columns[:-1]) 
import seaborn as sns
s = sns.heatmap(df_esrdonly[diffs].corr(),cmap='coolwarm')
s.set_yticklabels(s.get_yticklabels(),rotation=30,fontsize=7)
s.set_xticklabels(s.get_xticklabels(),rotation=30,fontsize=7)
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_food_groups_how_in_together_influencing_PCA_components' + '.png')
plt.show()

Vegetable by itself is important — contributing factors

protein + grain + then fruit

grain with fruit — slighly with dairy

dairy with protein and grain*

fruit with protein and grain then dairy

References

https://www.kaggle.com/etakla/exploring-the-dataset-univariate-analysis https://www.kaggle.com/etakla/exploring-the-dataset-bivariate-analysis https://towardsdatascience.com/survival-analysis-part-a-70213df21c2e https://lifelines.readthedocs.io/en/latest/ https://lifelines.readthedocs.io/en/latest/Survival%20Regression.html https://www.statsdirect.com/help/survival_analysis/cox_regression.htm https://courses.lumenlearning.com/suny-natural-resources-biometrics/chapter/chapter-7-correlation-and-simple-linear-regression/ https://www.kaggle.com/residentmario/univariate-plotting-with-pandas

#ref https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html #https://www.shanelynn.ie/using-pandas-dataframe-creating-editing-viewing-data-in-python/

https://www.oreilly.com/library/view/practical-statistics-for/9781491952955/ch04.html

https://erc.barnard.edu/spss/pearsons_r

https://www.statpac.com/statistics-calculator/correlation-regression.htm

https://data-flair.training/blogs/python-statistics/

https://www.analyticsvidhya.com/blog/2016/01/guide-data-exploration/

References:

Number of columns to display
https://stackoverflow.com/questions/11707586/how-do-i-expand-the-output-display-to-see-more-columns

Misc to help with Regression

#folder = './data-for-code/' #'C:/Users/Sayed Ahmed/mrp_project_implementation/phase methodology and experiments/excel-xlstat-analysis/acr-data-for-analysis/food-groups/' #/regression_consolidated-food-group-based-data-with-acr-and-blood-pressure-2015-2016.xlsx'
#os.listdir(folder)
file = data_folder + 'regression_consolidated-food-group-based-data-with-acr-and-blood-pressure-2015-2016.xlsx'
#excel_file = pd.ExcelFile(folder + 'regression_consolidated-food-group-based-data-with-acr-and-blood-pressure-2015-2016.xlsx')
file
excel_file = pd.ExcelFile(file)
# see all sheet names
sheet_names = excel_file.sheet_names
sheet_names#[:4]
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 10000)

foodgroup_acr_data_df = excel_file.parse('consolidated-food-group-acr')
foodgroup_acr_data_df.head(3)
df_food_group_only = foodgroup_acr_data_df[ ['participant_id', 'food_group_name', 'avg_food_weight_in_gms'] ]
df_food_group_only.head()
df_food_group_only.Tdf_food_group_only_index = df_food_group_only.set_index(['participant_id', 'food_group_name'])
df_food_group_only_index
df_food_group_only_index_unstack = df_food_group_only_index.unstack()df_food_group_only_index_unstack.head()foodgroup_acr_data_df.columnsdf_without_food_groups = foodgroup_acr_data_df.drop(['food_group_name', 'avg_food_weight_in_gms'], axis=1)
df_without_food_groups.head()
df_without_food_groups_one_entry = df_without_food_groups.groupby(['participant_id']).mean()merged = df_food_group_only_index_unstack.merge(df_without_food_groups_one_entry, how='inner', left_on='participant_id', right_on='participant_id', left_index=True)
merged.head()
merged.shape
merged.head()foodgroup_acr_data_df_t = foodgroup_acr_data_df.T
foodgroup_acr_data_df_t.head(10)
df_multi = foodgroup_acr_data_df.set_index(['participant_id', 'food_group_name'])
df_multi
df_multi = df_multi.unstack()df_multi.head()df_some = foodgroup_acr_data_df[['participant_id', 'participant_age']]
df_some.head()
df_some.merge(df_multi, how='left', left_on='participant_id', right_on='participant_id')Ref: https://hackernoon.com/reshaping-data-in-python-fa27dda2ff77# backup

#folder = './data-for-code/' #'C:/Users/Sayed Ahmed/mrp_project_implementation/phase methodology and experiments/excel-xlstat-analysis/acr-data-for-analysis/food-groups/' #/regression_consolidated-food-group-based-data-with-acr-and-blood-pressure-2015-2016.xlsx'
#os.listdir(folder)

file = data_folder + 'regression_consolidated-food-group-based-data-with-acr-and-blood-pressure-2015-2016.xlsx'
#excel_file = pd.ExcelFile(folder + 'regression_consolidated-food-group-based-data-with-acr-and-blood-pressure-2015-2016.xlsx')
file


excel_file = pd.ExcelFile(file)
# see all sheet names
sheet_names = excel_file.sheet_names
sheet_names#[:4]


pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 10000)

foodgroup_acr_data_df = excel_file.parse('consolidated-food-group-acr')
foodgroup_acr_data_df.head(3)


df_food_group_only = foodgroup_acr_data_df[ ['participant_id', 'food_group_name', 'avg_food_weight_in_gms'] ]
df_food_group_only.head()

df_food_group_only.T

df_food_group_only_index = df_food_group_only.set_index(['participant_id', 'food_group_name'])
df_food_group_only_index

df_food_group_only_index_unstack = df_food_group_only_index.unstack()

df_food_group_only_index_unstack.head()

foodgroup_acr_data_df.columns

df_without_food_groups = foodgroup_acr_data_df.drop(['food_group_name', 'avg_food_weight_in_gms'], axis=1)
df_without_food_groups.head()

df_without_food_groups_one_entry = df_without_food_groups.groupby(['participant_id']).mean()

merged = df_food_group_only_index_unstack.merge(df_without_food_groups_one_entry, how='inner', left_on='participant_id', right_on='participant_id', left_index=True)
merged.head()
merged.shape

merged.head()

merged.to_csv(data_folder + 'food_group_data_ready_for_excel_regression.csv')


foodgroup_acr_data_df_t = foodgroup_acr_data_df.T
foodgroup_acr_data_df_t.head(10)
https://hackernoon.com/reshaping-data-in-python-fa27dda2ff77
https://pandas.pydata.org/pandas-docs/stable/reference/frame.html

--

--

Justetc Social Services (non-profit)
Data Science Project Development

All proceeds from Medium will go to Justetc Social Services ( non-profit). Justetc Social Services provides services in the Training and Education Areas.