Food Subgroups
This section reuses code from the Exploration Phase. The code and comments will be updated as the work proceeds and may not be final until the analysis is complete.
# The PCA, correlation, and heatmap sections are the most important parts.
# Conclusions drawn from the plots may be adjusted/improved in future work.
# Target variable: ACR, i.e. Albumin-Creatinine Ratio
# Other probable targets: kidney failed, systolic/diastolic pressure
# The following target variables were tried previously and might not be present any more,
# though they may be brought back if they turn out to be important:
# ESRD patients: Avg. Annual Mortality rate (given the most importance)
# ESRD patients: Total (or %) deaths for target year
# Dialysis patients: Total (or %) deaths for target year

# For plotting, import libraries
from matplotlib import pyplot as plt
import matplotlib
# import seaborn as sns
%matplotlib inline
import pandas as pd
import numpy as np
from subprocess import check_output
from IPython.display import display, HTML
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)

import os
#data_folder = './acr-data-for-analysis/food-subgroups/'
data_folder = './data-for-code/'
data_files = os.listdir(data_folder)
data_files
# Output: the files in the data folder:
# ['average_intake_by_food_subgroups_demo_acr_kidney_blood_pressure.xlsx',
#  'food-subgroups-subgroup-names-on-columns.csv',
#  'more-col-recom-amt-2015-2016-consolidated-food-group-based-data-with-acr-and-blood-pressure.xlsx',
#  'regression_consolidated-food-group-based-data-with-acr-and-blood-pressure-2015-2016.xlsx',
#  'regression_variables_after_pca_saved_for_factor_transformations.csv',
#  'regression_variables_after_pca_saved_for_factor_transformations_food_subgroup.csv',
#  'transformed_regression_variables_after_pca_saved_for_factor_transformations.csv']

data_file = data_folder + 'average_intake_by_food_subgroups_demo_acr_kidney_blood_pressure.xlsx'
excel_file = pd.ExcelFile(data_file)

# see all sheet names
sheet_names = excel_file.sheet_names
sheet_names[:4]
# Output: ['Sheet1']

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)
foodgroup_acr_data_df = excel_file.parse('Sheet1')
foodgroup_acr_data_df.head(5)
foodgroup_acr_data_df.describe()
PCA: Apply PCA to find important features
#foodgroup_acr_data_df['food_group_id']
df_food_group_only = foodgroup_acr_data_df[ ['participant_id', 'food_group_name', 'avg_food_weight_in_gms'] ]
df_food_group_only.head()
#df_food_group_only.T
df_food_group_only_index = df_food_group_only.set_index(['participant_id', 'food_group_name'])
df_food_group_only_index
df_food_group_only_index_unstack = df_food_group_only_index.unstack()
df_food_group_only_index_unstack.head()
foodgroup_acr_data_df.columns
df_without_food_groups = foodgroup_acr_data_df.drop(['food_group_name', 'food_group_id', 'avg_food_weight_in_gms'], axis=1)
df_without_food_groups.head()
df_without_food_groups_one_entry = df_without_food_groups.groupby(['participant_id']).mean()
df_without_food_groups_one_entry.head()
# both frames are indexed by participant_id, so merge on the indexes
merged = df_food_group_only_index_unstack.merge(df_without_food_groups_one_entry, how='inner', left_index=True, right_index=True)
merged.head()
merged.shape
merged.head()
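As a cross-check (not part of the original pipeline), the same long-to-wide reshape can be sketched with pivot_table; the column names below are the ones already used above.
# Hedged sketch: equivalent reshape with pivot_table, assuming the same column names as above.
wide_alt = foodgroup_acr_data_df.pivot_table(index='participant_id',
                                             columns='food_group_name',
                                             values='avg_food_weight_in_gms',
                                             aggfunc='mean')
merged_alt = wide_alt.join(df_without_food_groups_one_entry, how='inner')
# merged_alt should match `merged` up to column naming and ordering.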
# flatten the MultiIndex columns produced by unstack: for the
# ('avg_food_weight_in_gms', <food_group_name>) tuples keep only the food-group name,
# and leave the other columns as they are
column_dict = {}
for aName in merged.columns:
    #print(aName)
    try:
        if 'avg_food_weight_in_gms' in aName:
            column_dict[aName] = aName[1]   # the food-group name part of the tuple
        else:
            column_dict[aName] = aName
    except:
        print(aName)
        continue
# column_dict

merged = merged.rename(index=str, columns=column_dict)
merged.head()
merged_f = merged.fillna(0)
merged_f.to_csv(data_folder + 'food-subgroups-subgroup-names-on-columns.csv')

regr_data = pd.read_csv(data_folder + 'regression_variables_after_pca_saved_for_factor_transformations.csv')
regr_data_t = regr_data.T
regr_data_t.to_csv(data_folder + 'transformed_regression_variables_after_pca_saved_for_factor_transformations.csv')

merged.columns
# Output: Index of all columns -- the 23 food-subgroup intake columns
# ('Added Sugars/Sugars and sweets' ... 'Whole grains') followed by the demographic,
# ACR, blood-pressure, nutrient, and kidney/urinary questionnaire columns
# ('participant_age', 'gender', 'acr', 'kidney_failed', 'systolic_pressure', ...,
#  'count_night_time_urinate'). The full list appears in the selection below.

df_drop_pca = merged[
[ 'Added Sugars/Sugars and sweets',
'Alcoholic beverages',
'Citrus fruits, juices',
'Dark-green vegetables',
'Dried fruits',
'Fruit juices and nectars excluding citrus',
'Fruits and juices baby food',
'Meat, Poultry and Eggs subgroup',
'Milk desserts, sauces, gravies',
'Milks and milk drinks ',
'Nonalcoholic beverages',
'Nuts, Seeds, and Soy Products', 'Oils',
'Other fruits',
'Other vegetables',
'Protein',
'Red and orange vegetables',
'Refined grains',
'Seafood',
'Solid Fats',
'Starchy vegetables',
'Water, noncarbonated',
'Whole grains',
#'participant_age',
#'gender',
#'acr',
#'kidney_failed',
#'systolic_pressure',
#'diastolic_pressure',
#'calorie',
#'protein',
#'sodium',
#'carbohydrate',
#'sugar',
#'fibre',
#'fat',
#'saturated_fat',
#'mono_fat',
#'poly_fat',
#'cholesterol',
#'calcium',
#'phosphorous',
#'magnesium',
#'potassium',
#'alcohol',
#'a_sample_food_code',
#'albumin_urine_mu_g', 'albumin_urine_mg', 'creatinine_mu_mol',
#'creatinine_mg', 'received_dialysis_in_12_months', 'kidney_stones',
#'passed_kidney_stones_12_months', 'urinary_leakage_frequency',
#'urine_lose_each_time', 'leak_during_activities',
#'how_frequent_leak_occurs', 'urinated_before_reaching_toilet',
#'how_frequent', 'leak_during_nonphysical_activities',
#'how_frequest_leak_nonphysical', 'how_much_leak_bothering',
#'how_much_daily_activities_affected', 'count_night_time_urinate'
]
]

# removing text features and/or less important features
#foodgroup_acr_data_df
#df_drop_pca = merged #.drop([ 'food_group_from_recom_amount', 'm_food_group_name', 'a_sample_food', 'a_sample_food_name'], axis=1)
#df_drop_pca = df_drop_pca.drop(['acr'], axis=1)
# 'participant_age', 'gender',
df_drop_pca = df_drop_pca.fillna(0)   # assign rather than fill in place to avoid chained-assignment issues on a column-selected frame
df_drop_pca.head(10)
# steps followed from ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
import sklearn
from sklearn import preprocessing
standardisedX = sklearn.preprocessing.scale(df_drop_pca)
standardisedX = pd.DataFrame(standardisedX, index=df_drop_pca.index, columns=df_drop_pca.columns)
standardisedX.apply(np.mean)
# Output: every column mean is ~0 (of the order of 1e-16) after standardisation, as expected.

standardisedX.apply(np.std)   # every column standard deviation should be ~1
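The same standardisation can be done with sklearn's StandardScaler; a minimal sketch, assuming the df_drop_pca frame defined above.
# Hedged sketch: z-score standardisation via StandardScaler (equivalent to preprocessing.scale).
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
standardisedX_alt = pd.DataFrame(scaler.fit_transform(df_drop_pca),
                                 index=df_drop_pca.index,
                                 columns=df_drop_pca.columns)
# standardisedX_alt should agree with standardisedX (mean ~0, std ~1 per column).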
from sklearn import decomposition
#pca = decomposition.PCA(n_components=2).fit(standardisedX)
pca = decomposition.PCA().fit(standardisedX)
pca
# Output: PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
#             svd_solver='auto', tol=0.0, whiten=False)

#ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
def pca_summary(pca, standardised_data, out=True):
    names = ["PC"+str(i) for i in range(1, len(pca.explained_variance_ratio_)+1)]
    a = list(np.std(pca.transform(standardised_data), axis=0))
    b = list(pca.explained_variance_ratio_)
    c = [np.sum(pca.explained_variance_ratio_[:i]) for i in range(1, len(pca.explained_variance_ratio_)+1)]
    columns = pd.MultiIndex.from_tuples([("sdev", "Standard deviation"), ("varprop", "Proportion of Variance"), ("cumprop", "Cumulative Proportion")])
    summary = pd.DataFrame(list(zip(a, b, c)), index=names, columns=columns)
    if out:
        print("Importance of components:")
        display(summary)
    return summary

summary = pca_summary(pca, standardisedX)
# Output: "Importance of components:" followed by the per-component summary table.
From the summary table, the first 22 components explain about 83% of the variance, and roughly 26 components are needed to pass 90%.
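This can be checked directly from the explained-variance ratios; a small sketch (not in the original notebook).
# Sketch: number of components needed to reach a given variance threshold.
cum_var = np.cumsum(pca.explained_variance_ratio_)
for threshold in (0.80, 0.90, 0.95):
    n_needed = int(np.argmax(cum_var >= threshold)) + 1
    print(f"{threshold:.0%} of variance reached with {n_needed} components")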
# ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
def screeplot(pca, standardised_values):
    y = np.std(pca.transform(standardised_values), axis=0)**2
    x = np.arange(len(y)) + 1
    plt.plot(x, y, "o-")
    plt.xticks(x, ["Comp."+str(i) for i in x], rotation=60)
    plt.ylabel("Variance")
    plt.title('PCA variance in Components')
    plt.savefig('pca_components_variance' + '.png')
    plt.show()

plt.rcParams['figure.figsize'] = 10, 10
screeplot(pca, standardisedX)
# Note: screeplot() already saves and shows the figure; a plt.savefig() call after plt.show()
# only writes an empty canvas (this is what produced the "<Figure size 720x720 with 0 Axes>" output),
# so the food-subgroup copy should be saved inside screeplot() before plt.show().
#plt.savefig('./images/pca_components_variance_food_subgroup' + '.png')
The largest change in slope is from component 3 to component 4, so the first three components can be retained.
Other noticeable slope changes occur at 5 to 6, 8 to 9, 25 to 26, and 30 to 31.
Up to 3 components are clearly significant; up to 26 (or just up to 8) could be experimented with.
Keeping just the first 8 is preferred here.
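If eight components are kept, the reduced representation can be computed as in the sketch below (the component count is a judgment call from the scree plot, not a fixed choice).
# Sketch: keep the first 8 components and project the standardised data.
pca8 = decomposition.PCA(n_components=8).fit(standardisedX)
scores8 = pd.DataFrame(pca8.transform(standardisedX),
                       index=standardisedX.index,
                       columns=['PC' + str(i + 1) for i in range(8)])
print(scores8.shape, pca8.explained_variance_ratio_.sum())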
# ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
# not my code; I am using this as (similar to) a library function
def calcpc(variables, loadings):
    # find the number of samples in the data set and the number of variables
    numsamples, numvariables = variables.shape
    # make a vector to store the component
    pc = np.zeros(numsamples)
    # calculate the value of the component for each sample
    for i in range(numsamples):
        valuei = 0
        for j in range(numvariables):
            valueij = variables.iloc[i, j]
            loadingj = loadings[j]
            valuei = valuei + (valueij * loadingj)
        pc[i] = valuei
    return pc

calcpc(standardisedX, pca.components_[0])
# Output: array([ 0.4024334 ,  2.86288165,  0.63548882, ..., -0.27547279,
#                -0.40052422,  0.01408633])
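The double loop in calcpc is just a dot product of the standardised data with the loading vector; a vectorised sketch that should reproduce the same scores.
# Sketch: vectorised equivalent of calcpc (data matrix times loading vector).
pc1_scores = standardisedX.values @ pca.components_[0]
# should match calcpc(standardisedX, pca.components_[0]) and pca.transform(standardisedX)[:, 0]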
pca.transform(standardisedX)[:, 0]
# Output: array([ 0.4024334 ,  2.86288165,  0.63548882, ..., -0.27547279,
#                -0.40052422,  0.01408633])  -- identical to the calcpc result above

pca.components_[1]
# Output: array([-0.04013536, -0.21662898,  0.2646046 ,  0.2096937 ,  0.15558643,
#                 0.23267299, -0.01198426, -0.10661167,  0.24079041,  0.30859506,
#                -0.15275508,  0.22375509,  0.14752615,  0.45938378,  0.30101746,
#                 0.23534633,  0.25186201,  0.19465321,  0.16763907,  0.08938424,
#                 0.06329616,  0.01186119, -0.02546397])

np.sum(pca.components_[1]**2)
# Output: 1.0000000000000004 -- the loadings of a component have unit norm

# for the following code: classes such as high, low, and neutral mortality will/might be defined in the final work
#df_drop_pca['acr']
#sorted(standardisedX['acr'])

"""
y = df_drop_pca['acr'] #< 0.5
#y
dbk= np.where(df_drop_pca['acr'] < 30, 'normal-mild', df_drop_pca['acr'])
dbk = np.where(df_drop_pca['acr'] > 30, 'moderate', df_drop_pca['acr'])
dbk
"""
y = []
for aVal in merged['acr']:
    # conditions are chained with elif so each ACR value maps to exactly one class
    # (note: missing ACR values would fall through to the last branch)
    if aVal < 30:
        y.append('normal-mild')
    elif aVal > 300:
        y.append('severe')
    elif aVal < 150:
        y.append('mild-moderate-low')
    else:
        #print(aVal)
        y.append('mild-moderate-high')
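The same banding can be written with pd.cut, which keeps each ACR value in exactly one class; a sketch assuming the band edges used above (30, 150, 300).
# Sketch: vectorised ACR banding with pd.cut (right-inclusive bin edges, so the
# handling of values exactly at 30/150/300 differs slightly from the loop above).
y_series = pd.cut(merged['acr'],
                  bins=[-np.inf, 30, 150, 300, np.inf],
                  labels=['normal-mild', 'mild-moderate-low',
                          'mild-moderate-high', 'severe'])
# y_series.value_counts() gives the class balance at a glance.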
y
# Output (condensed): the list of ACR class labels, one per participant; the vast
# majority are 'normal-mild' or 'mild-moderate-low', with occasional
# 'mild-moderate-high' and 'severe' entries.

# ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
# not my code from the URL above, using this as a library function
def pca_scatter(pca, standardised_values, classifs):
    foo = pca.transform(standardised_values)
    bar = pd.DataFrame(list(zip(foo[:, 0], foo[:, 1], classifs)), columns=["PC1", "PC2", "Class"])
    #plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_components_separating_high_low_mortality' + '.png')
    sns.lmplot("PC1", "PC2", bar, hue="Class", fit_reg=False)

#y = df_esrdonly[' ESRD patients: Total (or %) deaths for target year']
#y = np.std(pca.transform(standardisedX), axis=0)**2
plt.rcParams['figure.figsize'] = 12, 12
pca_scatter(pca, standardisedX, y)
#plt.title('Only two components can separate the mortality data. True = High ')
#plt.savefig('pca_components_separating_high_low_mortality' + '.png')

# reference: https://towardsdatascience.com/dive-into-pca-principal-component-analysis-with-python-43ded13ead21
plt.rcParams['figure.figsize'] = 30, 16
components_to_count = 3
pca_components_cont = pca.components_[0:components_to_count]
ylabels = []
for c in range(components_to_count):
    ylabels.append('Component ' + str(c))
xlabels = []
for c in range(components_to_count):
    xlabels.append(c)
plt.matshow(pca_components_cont, cmap='viridis')
plt.yticks(xlabels, ylabels, fontsize=10)
plt.colorbar()
plt.xticks(range(len(df_drop_pca.columns)), df_drop_pca.columns, rotation=90, ha='left')
plt.show()

# Negative loadings: refined grains, milk drinks, solid fats

# reference: https://towardsdatascience.com/dive-into-pca-principal-component-analysis-with-python-43ded13ead21
plt.rcParams['figure.figsize'] = 30, 16
components_to_count = 8
pca_components_cont = pca.components_[0:components_to_count]
ylabels = []
for c in range(components_to_count):
    ylabels.append('Component ' + str(c))
xlabels = []
for c in range(components_to_count):
    xlabels.append(c)
plt.matshow(pca_components_cont, cmap='viridis')
plt.yticks(xlabels, ylabels, fontsize=10)
plt.colorbar()
plt.xticks(range(len(df_drop_pca.columns)), df_drop_pca.columns, rotation=90, ha='left')
plt.show()
From the first 3 components: nonalcoholic beverages, other fruits, dark-green vegetables, alcoholic beverages, seafood, and milks stand out.
1st component: nonalcoholic beverages, meat, alcoholic beverages, starchy vegetables, water, whole grains, solid fats, protein.
2nd component: other fruits, milks.
3rd component: dark-green vegetables, seafood.
From 7 components: fruits and juices, nonalcoholic beverages, nuts, oils, other fruits, meat, alcoholic beverages, dark-green vegetables, protein, refined grains, whole grains, fat.
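To make the heatmap reading less subjective, the strongest loadings per component can be listed programmatically; a sketch using the fitted pca object.
# Sketch: print the strongest loadings (by absolute value) of the first three components.
loadings = pd.DataFrame(pca.components_[:3].T,
                        index=df_drop_pca.columns,
                        columns=['PC1', 'PC2', 'PC3'])
for pc in loadings.columns:
    print(pc, loadings[pc].abs().sort_values(ascending=False).head(5).index.tolist())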
Selection of variables for regression
regression_variables = merged[
[
#'Added Sugars/Sugars and sweets',
'Alcoholic beverages',
#'Citrus fruits, juices',
'Dark-green vegetables',
#'Dried fruits',
#'Fruit juices and nectars excluding citrus',
'Fruits and juices baby food',
'Meat, Poultry and Eggs subgroup',
'Milk desserts, sauces, gravies',
'Milks and milk drinks ',
'Nonalcoholic beverages',
'Nuts, Seeds, and Soy Products', 'Oils',
'Other fruits',
#'Other vegetables',
'Protein',
#'Red and orange vegetables',
'Refined grains',
'Seafood',
'Solid Fats',
'Starchy vegetables',
'Water, noncarbonated',
'Whole grains',
#'participant_age',
#'gender',
'acr',
'kidney_failed',
'systolic_pressure',
'diastolic_pressure'
]
]

regression_variables.corr()
plt.figure(figsize=(16, 16))
corr = regression_variables.corr()
sns.heatmap(corr,
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True,
cmap="BuPu");
plt.suptitle('Heatmap, Correlation All Variables');
plt.savefig('./images/regression_food_subgroup_and_acr.png')
From the heatmap: ACR is most associated with milk desserts, meat and poultry, alcohol, and the (high) blood-pressure variables; kidney_failed is most associated with dark-green vegetables and milk desserts.
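As an illustrative follow-up to the correlation heatmap (not the regression reported elsewhere in this work), a baseline linear fit of ACR on the selected food-subgroup intakes could look like this sketch; X_reg, y_reg, and lin_model are names introduced here only.
# Hedged sketch: baseline linear regression of ACR on the selected food-subgroup intakes.
from sklearn.linear_model import LinearRegression
X_reg = regression_variables.drop(['acr', 'kidney_failed',
                                   'systolic_pressure', 'diastolic_pressure'], axis=1).fillna(0)
y_reg = regression_variables['acr'].fillna(0)
lin_model = LinearRegression().fit(X_reg, y_reg)
print(lin_model.score(X_reg, y_reg))   # R^2 on the training data
print(pd.Series(lin_model.coef_, index=X_reg.columns).sort_values())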
The code below is not part of the methodology and experiment sections.
# The plot above gives a rough view of the correlations with CKD status and ACR
df_drop_pca.columns
# Output: Index(['Added Sugars/Sugars and sweets', 'Alcoholic beverages',
#        'Citrus fruits, juices', 'Dark-green vegetables', 'Dried fruits',
#        'Fruit juices and nectars excluding citrus',
#        'Fruits and juices baby food', 'Meat, Poultry and Eggs subgroup',
#        'Milk desserts, sauces, gravies', 'Milks and milk drinks ',
#        'Nonalcoholic beverages', 'Nuts, Seeds, and Soy Products', 'Oils',
#        'Other fruits', 'Other vegetables', 'Protein',
#        'Red and orange vegetables', 'Refined grains', 'Seafood', 'Solid Fats',
#        'Starchy vegetables', 'Water, noncarbonated', 'Whole grains'],
#       dtype='object')

df_drop_pca_important = df_drop_pca[
[ 'Dairy', 'Fats, oils, and salad dressings', 'Fruits', 'Grains',
'Protein', 'Sugars, sweets, and beverages', 'Vegetables',
'participant_age', 'gender',
#'age_from_recom_amount',
#'avg_taken_from_recom_amount',
#'recommended_low',
#'recommended_high',
'acr',
'kidney_failed', 'systolic_pressure', 'diastolic_pressure',
'calorie', 'protein', 'sodium', 'carbohydrate', 'sugar', 'fibre', 'fat',
'saturated_fat', 'mono_fat', 'poly_fat', 'cholesterol', 'calcium',
'phosphorous', 'magnesium', 'potassium', 'alcohol',
#'a_sample_food_code',
'albumin_urine_mu_g', 'albumin_urine_mg',
'creatinine_mu_mol', 'creatinine_mg',
#'received_dialysis_in_12_months',
'kidney_stones', 'passed_kidney_stones_12_months',
'urinary_leakage_frequency', 'urine_lose_each_time',
'leak_during_activities', 'how_frequent_leak_occurs',
'urinated_before_reaching_toilet', 'how_frequent',
'leak_during_nonphysical_activities', 'how_frequest_leak_nonphysical',
'how_much_leak_bothering', 'how_much_daily_activities_affected',
'count_night_time_urinate'
]
]
# This cell raised a KeyError: columns such as 'Dairy', 'Fats, oils, and salad dressings',
# 'Fruits', 'Grains', the nutrient columns, and the urinary questionnaire columns are not in
# df_drop_pca, which only contains the food-subgroup intake columns.

df_drop_pca_important.corr()

plt.figure(figsize=(12, 8))
corr = df_drop_pca_important.corr()
sns.heatmap(corr,
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation All Variables');
Next, the analysis is repeated on the actual intake amounts only.
The age_from and age_to columns can be removed.
df = pd.read_csv(data_folder + 'no-empty-data-only-actual-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing.csv')
# df = df.drop([ 'age_from', 'age_to' ], axis=1)
df.head()
# This cell raised a FileNotFoundError:
# './data-for-code/no-empty-data-only-actual-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing.csv'
# does not exist in the data folder, so the cells below rely on a df loaded in an earlier run.

df.describe()

# find correlations after standardization
df.corr()
# Output (condensed): correlation matrix of the food-group intake and mortality columns.
# The mortality/death targets correlate strongly with age_from/age_to (~0.85-0.91) and with
# actual vegetable intake (~0.44-0.61), and negatively with actual grain, dairy, and fruit intake.
plt.figure(figsize=(12, 8))
corr = df.corr()
sns.heatmap(corr,
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation Actual Intake Amounts Only');

# Without age groups
df_without_ages = df.drop(['age_from', 'age_to', ' Gender'], axis=1)
df_without_ages.head()

df_without_ages.corr()

plt.figure(figsize=(12, 8))
corr = df_without_ages.corr()
sns.heatmap(corr,
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation Actual Intake Amounts Only');
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/univariate_food_groups_heatmaps_actual_intake_amount' + '.png')
Next, normalization is applied to all the data.
df = pd.read_csv('no-empty-data-only-actual-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing.csv')
df = df.drop([' Gender'], axis=1)
df.head()

# normalize data (mean-centred, scaled by the range)
df_normalized = (df - df.mean())/ (df.max() - df.min())
df_normalized.head()
# df.head()

df_normalized.corr()
# Output (condensed): the same correlation structure as before, since this linear normalization
# does not change Pearson correlations: mortality targets correlate strongly with age and vegetable
# intake, and negatively with grain, dairy, and fruit intake.
plt.figure(figsize=(12, 8))
corr = df_normalized.corr()
sns.heatmap(corr,
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation Actual Intake Amounts Only');
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/univariate_food_groups_heatmaps_normalized_actual_intake_amount' + '.png')
Next, check only the difference of intake from the average recommended amount.
df = pd.read_csv('copy-only-diff-no-empty-data-diff-recomm-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing .csv')
df.head()

df.describe()

# find correlations after standardization
df.corr()
# Output (condensed): correlation matrix including the difference-from-recommended columns.
# The mortality/death targets correlate positively with the vegetable-intake difference (~0.36-0.43)
# and negatively with the protein, grain, dairy, and fruit differences (roughly -0.21 to -0.49),
# alongside the strong age correlations seen above.
plt.figure(figsize=(12, 8))
corr = df.corr()
sns.heatmap(corr,
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation Actual Intake Variables, and Difference from Recommended Intake Variables');

df.head()
df = df.drop([' Gender'], axis=1)
df.head()

# normalize data
df_normalized = (df - df.mean())/ (df.max() - df.min())
df_normalized.head()
#df.head()

df_normalized.corr()

plt.figure(figsize=(12, 8))
corr = df_normalized.corr()
sns.heatmap(corr,
xticklabels = corr.columns.values,
yticklabels = corr.columns.values,
annot = True);
plt.suptitle('Heatmap, Correlation Actual Intake Variables, and Difference from Recommended Intake Variables');
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/univariate_food_groups_heatmaps_diff_intake_amount_normalized' + '.png')
Bivariate
The most important bivariate outputs are bivariate_diff_norm.png and bivariate_diff_norm_rate_only.png, which use the differences of intake amounts from the recommended amounts, normalized.
Bivariate plots of the actual intake amounts against the target variables are saved in bivariate.png; the correlation pattern can be checked in that image.
Correlations among the raw intake amounts say little on their own, because they are actual intakes rather than differences from the recommended amounts, and because older people tend to have higher intake overall; normalized data gives a truer picture of the relationships.
The correlations were shown above; the bivariate plots now show which relationships are linear and which are not.
The plots are consistent with the correlation results.
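Since the bivariate plots are used to judge linearity, a quick numeric cross-check is to compare Pearson and Spearman correlations; a sketch in which target_col is a placeholder for one of the mortality or ACR target columns.
# Hedged sketch: large gaps between Pearson and Spearman suggest monotonic but non-linear relations.
def pearson_vs_spearman(frame, target_col):
    pearson = frame.corr(method='pearson')[target_col]
    spearman = frame.corr(method='spearman')[target_col]
    return pd.DataFrame({'pearson': pearson, 'spearman': spearman,
                         'gap': (pearson - spearman).abs()}).sort_values('gap', ascending=False)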
import numpy as np
import pandas as pd
from IPython.display import display, HTML
from matplotlib import pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
%matplotlib inline
# ref in ref section
On Actual intake Amounts
df = pd.read_csv('no-empty-data-only-actual-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing.csv')
#df_actual = df.drop([' Gender', 'age_from', 'age_to',' ESRD patients: Avg. Annual Mortality rates', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
#df_actual = df.drop([' Gender', 'age_from', 'age_to',' ESRD patients: Avg. Annual Mortality rates', ' ESRD patients: Total (or %) deaths for target year', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
df_actual = df.drop([' Gender', 'age_from', 'age_to'], axis=1)
df_actual.head()

# on actual amounts
#plt.figure(figsize=(16, 300))
sns.pairplot(df_actual, vars=df_actual.columns, size=5, kind='reg'); # diag_kind='kde',
plt.title('Bivariate Plot, All Actual Taken Variables, Total ESRD target variable');
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/bivariate_food_group' + '.png')
plt.show()
On difference from recommended amount (data not normalized)
df = pd.read_csv('copy-only-diff-no-actual-no-empty-cell-diff-recomm-mortality_recom_added_group_data_june_9th_gender_based_data_after_processing .csv')
df.head()

# On difference from recommended amount (data not normalized)
#df_actual_diff = df.drop(['age_from', 'age_to',' ESRD patients: Avg. Annual Mortality rates', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
#df_actual_diff.head()
#df_actual = df.drop([' Gender', 'age_from', 'age_to',' ESRD patients: Avg. Annual Mortality rates', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
#df_actual = df.drop([' Gender', 'age_from', 'age_to',' ESRD patients: Avg. Annual Mortality rates', ' ESRD patients: Total (or %) deaths for target year', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
df_actual_diff = df.drop(['age_from', 'age_to'], axis=1)
df_actual_diff.head()
#df.head()

# On difference from recommended amount (data not normalized)
sns.pairplot(df_actual_diff, vars=df_actual_diff.columns, size=5, kind='reg');
#plt.title('Bivariate Plot, All Actual Taken Variables, Total ESRD target variable');
#plt.savefig('./saved_images_from_visualizations/' + 'line_' +indicator.replace(' ', '_')[0:10] + '_' + str(np.random.randint(0, 99999)) + '.png')
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/bivariate_diff' + '.png')
plt.show()
On difference from recommended amount (data/diff normalized)
df_normalized_diff = (df_actual_diff - df_actual_diff.mean())/ (df_actual_diff.max() - df_actual_diff.min())
df_normalized_diff.head()
sns.pairplot(df_normalized_diff, vars=df_normalized_diff.columns, size=5, kind='reg', aspect=1); # diag_kind='kde',
#plt.title('Bivariate Plot, All Actual Taken Variables, Total ESRD target variable');
#plt.savefig('./saved_images_from_visualizations/' + 'line_' +indicator.replace(' ', '_')[0:10] + '_' + str(np.random.randint(0, 99999)) + '.png')
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/bivariate_diff_norm' + '.png')
plt.show()

df_normalized_diff = df_normalized_diff.drop([' ESRD patients: Total (or %) deaths for target year', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates'], axis=1)
df_normalized_diff.head()

sns.pairplot(df_normalized_diff, vars=df_normalized_diff.columns, size=5, kind='reg', aspect=1); # diag_kind='kde',
plt.suptitle('Bivariate : Diff : Food Group: Normalized\n')
plt.ylabel('Difference in Intake amount from Recommended : Normalized')
plt.xlabel('Intakes')
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/bivariate_diff_norm_rate_only' + '.png')
plt.show()

# can be ignored
"""
df_esrdonly = df.drop(['age_from', 'age_to',' ESRD patients: Avg. Annual Mortality rates', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
df_esrdonly.head()
plt.figure(figsize=(14, 14))
sns.pairplot(df_esrdonly, diag_kind='kde');
plt.xlabel('Bivariate Plot, Difference from Recommended Variables, Total ESRD target variable');
#plt.figure(figsize=(16, 16))
plt.rcParams['figure.figsize'] = 16, 16
pd.plotting.scatter_matrix(df_esrdonly, diagonal="kde")
plt.suptitle('Scatter Matrix');
plt.show();
"""
PCA
Applying PCA on Difference data
df_esrdonly = df.drop(['age_from', 'age_to',' ESRD patients: Total (or %) deaths for target year', ' Dialysis patients: Total (or %) deaths for target year', ' Dialysis patients: Avg. Annual Mortality rates' ], axis=1)
df_esrdonly.head()
#df_esrdonly.head()

# steps followed from ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
import sklearn
from sklearn import preprocessing
standardisedX = sklearn.preprocessing.scale(df_esrdonly)
standardisedX = pd.DataFrame(standardisedX, index=df_esrdonly.index, columns=df_esrdonly.columns)
standardisedX.apply(np.mean)
standardisedX.apply(np.std)

from sklearn import decomposition
pca = decomposition.PCA(n_components=2).fit(standardisedX)
pca = decomposition.PCA().fit(standardisedX)
pca

#ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
def pca_summary(pca, standardised_data, out=True):
    names = ["PC"+str(i) for i in range(1, len(pca.explained_variance_ratio_)+1)]
    a = list(np.std(pca.transform(standardised_data), axis=0))
    b = list(pca.explained_variance_ratio_)
    c = [np.sum(pca.explained_variance_ratio_[:i]) for i in range(1, len(pca.explained_variance_ratio_)+1)]
    columns = pd.MultiIndex.from_tuples([("sdev", "Standard deviation"), ("varprop", "Proportion of Variance"), ("cumprop", "Cumulative Proportion")])
    summary = pd.DataFrame(list(zip(a, b, c)), index=names, columns=columns)
    if out:
        print("Importance of components:")
        display(summary)
    return summary

summary = pca_summary(pca, standardisedX)
# The first two components explain over 95% of the variance

np.sum(summary.sdev**2)

# ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
plt.rcParams['figure.figsize'] = 8, 8
def screeplot(pca, standardised_values):
    y = np.std(pca.transform(standardised_values), axis=0)**2
    x = np.arange(len(y)) + 1
    plt.plot(x, y, "o-")
    plt.xticks(x, ["Comp."+str(i) for i in x], rotation=60)
    plt.ylabel("Variance")
    plt.title('PCA variance in Components')
    plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_components_variance' + '.png')
    plt.show()

screeplot(pca, standardisedX)
#plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_components_variance' + '.png')
# comp 2 to comp 3 shows the biggest change in slope
# the first two, or at most the first three, components can be retained

summary.sdev**2
# Output: PC1 4.263124, PC2 1.449435 -- these components can be retained

pca.components_[0]

np.sum(pca.components_[0]**2)

# ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
# not my code, I am using this as (similar to) a library function
def calcpc(variables, loadings):
    # find the number of samples in the data set and the number of variables
    numsamples, numvariables = variables.shape
    # make a vector to store the component
    pc = np.zeros(numsamples)
    # calculate the value of the component for each sample
    for i in range(numsamples):
        valuei = 0
        for j in range(numvariables):
            valueij = variables.iloc[i, j]
            loadingj = loadings[j]
            valuei = valuei + (valueij * loadingj)
        pc[i] = valuei
    return pc
calcpc(standardisedX, pca.components_[0])
pca.transform(standardisedX)[:, 0]
pca.components_[1]
np.sum(pca.components_[1]**2)
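The hand-rolled calcpc above should reproduce the scores that scikit-learn computes; a quick consistency check (a minimal sketch reusing the objects already defined):
# Verify that the manual loading computation reproduces sklearn's projection
manual_pc1 = calcpc(standardisedX, pca.components_[0])
sklearn_pc1 = pca.transform(standardisedX)[:, 0]
print("Manual PC1 matches sklearn PC1:", np.allclose(manual_pc1, sklearn_pc1))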
Variables with the highest loadings on the leading components
# for the following code: classes such as high, low, and neutral mortality will/might be defined in the final work
df_esrdonly[' ESRD patients: Avg. Annual Mortality rates']
sorted(standardisedX[' ESRD patients: Avg. Annual Mortality rates'])
# Define high and low mortality
y = df_esrdonly[' ESRD patients: Avg. Annual Mortality rates'] #< 0.5
# on the standardised data, > 0.5 (more than 0.5 standard deviations above the mean) = high mortality
# Note: the older age groups show higher mortality
y = standardisedX[' ESRD patients: Avg. Annual Mortality rates'] > 0.5
y
# ref: https://python-for-multivariate-analysis.readthedocs.io/a_little_book_of_python_for_multivariate_analysis.html
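Since the 0.5 cut-off on the standardised mortality rate is a judgment call, a quick class-balance check (a minimal sketch on the y just defined) confirms that both classes are populated before plotting:
# How many observations fall into each mortality class under the > 0.5 cut-off
print(y.value_counts())
print("Proportion labelled high mortality:", y.mean())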
# not my code from the URL above, using this as a library function
def pca_scatter(pca, standardised_values, classifs):
    foo = pca.transform(standardised_values)
    bar = pd.DataFrame(list(zip(foo[:, 0], foo[:, 1], classifs)), columns=["PC1", "PC2", "Class"])
    #plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_components_separating_high_low_mortality' + '.png')
    sns.lmplot(x="PC1", y="PC2", data=bar, hue="Class", fit_reg=False)
#y = df_esrdonly[' ESRD patients: Total (or %) deaths for target year']
#y = np.std(pca.transform(standardisedX), axis=0)**2
pca_scatter(pca, standardisedX, y)
# y can be used for classes such as high, low, and neutral mortality
# plt.suptitle('Mortality class and Principal components; y can be used for classes such as high, low, and neutral mortality');
plt.title('Only two components can separate the mortality data. True = High')
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_components_separating_high_low_mortality' + '.png')
# reference: https://towardsdatascience.com/dive-into-pca-principal-component-analysis-with-python-43ded13ead21
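The comments above anticipate a three-way split into high, low, and neutral mortality. The sketch below is one hypothetical way to do that with pd.cut; the cut points of -0.5 and 0.5 are assumptions, not values from the original analysis.
# Hypothetical three-class mortality labels on the standardised mortality rate
mortality_std = standardisedX[' ESRD patients: Avg. Annual Mortality rates']
y3 = pd.cut(mortality_std, bins=[-np.inf, -0.5, 0.5, np.inf], labels=['low', 'neutral', 'high'])
pca_scatter(pca, standardisedX, y3)
plt.title('PC1 vs PC2 coloured by three-class mortality (hypothetical cut points)')
plt.show()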
pca_components_cont = pca.components_[0:2]
plt.matshow(pca_components_cont, cmap='viridis')
plt.yticks([0,1],['1st Comp','2nd Comp'],fontsize=10)
plt.colorbar()
#plt.xticks(range(1, len(df_esrdonly.columns)),df_esrdonly.columns[1:len(df_esrdonly.columns)],rotation=65)
plt.xticks(range(len(df_esrdonly.columns)),df_esrdonly.columns,rotation=65,ha='left')
#plt.tight_layout()
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_food_groups_what_contributes_to_PCA_components' + '.png')
plt.show()
## From the above plot, Vegetable, Grain, and Protein contribute the most to the 1st component
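That reading of the plot can be cross-checked numerically by ranking the first component's loadings by absolute value (a minimal sketch reusing pca and df_esrdonly from above):
# Rank variables by the magnitude of their loading on the first component
pc1_loadings = pd.Series(pca.components_[0], index=df_esrdonly.columns)
print(pc1_loadings.reindex(pc1_loadings.abs().sort_values(ascending=False).index))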
How the intake differences jointly contribute to the influential PCA components
diffs = list(df_esrdonly.columns[:-1])
import seaborn as sns
s = sns.heatmap(df_esrdonly[diffs].corr(),cmap='coolwarm')
s.set_yticklabels(s.get_yticklabels(),rotation=30,fontsize=7)
s.set_xticklabels(s.get_xticklabels(),rotation=30,fontsize=7)
plt.savefig('../../progress_reports/to_submit/pca_univariate_bivariate/pca_food_groups_how_in_together_influencing_PCA_components' + '.png')
plt.show()
Vegetable by itself is an important contributing factor.
Protein correlates with grain, then fruit.
Grain correlates with fruit, and slightly with dairy.
Dairy correlates with protein and grain.
Fruit correlates with protein and grain, then dairy.
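These pairings can also be listed programmatically by pulling the strongest off-diagonal correlations out of the same matrix (a minimal sketch that only reuses df_esrdonly and diffs from above):
# List the strongest pairwise correlations among the intake-difference variables
corr = df_esrdonly[diffs].corr()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))  # keep each pair once
pairs = upper.stack()
print(pairs.reindex(pairs.abs().sort_values(ascending=False).index).head(10))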
References
https://www.kaggle.com/etakla/exploring-the-dataset-univariate-analysis
https://www.kaggle.com/etakla/exploring-the-dataset-bivariate-analysis
https://towardsdatascience.com/survival-analysis-part-a-70213df21c2e
https://lifelines.readthedocs.io/en/latest/
https://lifelines.readthedocs.io/en/latest/Survival%20Regression.html
https://www.statsdirect.com/help/survival_analysis/cox_regression.htm
https://courses.lumenlearning.com/suny-natural-resources-biometrics/chapter/chapter-7-correlation-and-simple-linear-regression/
https://www.kaggle.com/residentmario/univariate-plotting-with-pandas
https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html
https://www.shanelynn.ie/using-pandas-dataframe-creating-editing-viewing-data-in-python/
https://www.oreilly.com/library/view/practical-statistics-for/9781491952955/ch04.html
https://erc.barnard.edu/spss/pearsons_r
https://www.statpac.com/statistics-calculator/correlation-regression.htm
https://data-flair.training/blogs/python-statistics/
https://www.analyticsvidhya.com/blog/2016/01/guide-data-exploration/
References:
Number of columns to display
https://stackoverflow.com/questions/11707586/how-do-i-expand-the-output-display-to-see-more-columns
Misc to help with Regression
#folder = './data-for-code/' #'C:/Users/Sayed Ahmed/mrp_project_implementation/phase methodology and experiments/excel-xlstat-analysis/acr-data-for-analysis/food-groups/' #/regression_consolidated-food-group-based-data-with-acr-and-blood-pressure-2015-2016.xlsx'
#os.listdir(folder)
file = data_folder + 'regression_consolidated-food-group-based-data-with-acr-and-blood-pressure-2015-2016.xlsx'
#excel_file = pd.ExcelFile(folder + 'regression_consolidated-food-group-based-data-with-acr-and-blood-pressure-2015-2016.xlsx')
file
excel_file = pd.ExcelFile(file)
# see all sheet names
sheet_names = excel_file.sheet_names
sheet_names#[:4]
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 10000)
foodgroup_acr_data_df = excel_file.parse('consolidated-food-group-acr')
foodgroup_acr_data_df.head(3)
df_food_group_only = foodgroup_acr_data_df[ ['participant_id', 'food_group_name', 'avg_food_weight_in_gms'] ]
df_food_group_only.head()
df_food_group_only.T
df_food_group_only_index = df_food_group_only.set_index(['participant_id', 'food_group_name'])
df_food_group_only_index
df_food_group_only_index_unstack = df_food_group_only_index.unstack()
df_food_group_only_index_unstack.head()
foodgroup_acr_data_df.columns
df_without_food_groups = foodgroup_acr_data_df.drop(['food_group_name', 'avg_food_weight_in_gms'], axis=1)
df_without_food_groups.head()
df_without_food_groups_one_entry = df_without_food_groups.groupby(['participant_id']).mean()
merged = df_food_group_only_index_unstack.merge(df_without_food_groups_one_entry, how='inner', left_on='participant_id', right_on='participant_id', left_index=True)
merged.head()
merged.shape
merged.head()
foodgroup_acr_data_df_t = foodgroup_acr_data_df.T
foodgroup_acr_data_df_t.head(10)
df_multi = foodgroup_acr_data_df.set_index(['participant_id', 'food_group_name'])
df_multi
df_multi = df_multi.unstack()
df_multi.head()
df_some = foodgroup_acr_data_df[['participant_id', 'participant_age']]
df_some.head()
df_some.merge(df_multi, how='left', left_on='participant_id', right_on='participant_id')
Ref: https://hackernoon.com/reshaping-data-in-python-fa27dda2ff77
# backup
#folder = './data-for-code/' #'C:/Users/Sayed Ahmed/mrp_project_implementation/phase methodology and experiments/excel-xlstat-analysis/acr-data-for-analysis/food-groups/' #/regression_consolidated-food-group-based-data-with-acr-and-blood-pressure-2015-2016.xlsx'
#os.listdir(folder)
file = data_folder + 'regression_consolidated-food-group-based-data-with-acr-and-blood-pressure-2015-2016.xlsx'
#excel_file = pd.ExcelFile(folder + 'regression_consolidated-food-group-based-data-with-acr-and-blood-pressure-2015-2016.xlsx')
file
excel_file = pd.ExcelFile(file)
# see all sheet names
sheet_names = excel_file.sheet_names
sheet_names#[:4]
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 5000)
pd.set_option('display.width', 10000)
foodgroup_acr_data_df = excel_file.parse('consolidated-food-group-acr')
foodgroup_acr_data_df.head(3)
df_food_group_only = foodgroup_acr_data_df[ ['participant_id', 'food_group_name', 'avg_food_weight_in_gms'] ]
df_food_group_only.head()
df_food_group_only.T
df_food_group_only_index = df_food_group_only.set_index(['participant_id', 'food_group_name'])
df_food_group_only_index
df_food_group_only_index_unstack = df_food_group_only_index.unstack()
df_food_group_only_index_unstack.head()
foodgroup_acr_data_df.columns
df_without_food_groups = foodgroup_acr_data_df.drop(['food_group_name', 'avg_food_weight_in_gms'], axis=1)
df_without_food_groups.head()
df_without_food_groups_one_entry = df_without_food_groups.groupby(['participant_id']).mean()
merged = df_food_group_only_index_unstack.merge(df_without_food_groups_one_entry, how='inner', left_on='participant_id', right_on='participant_id', left_index=True)
merged.head()
merged.shape
merged.head()
merged.to_csv(data_folder + 'food_group_data_ready_for_excel_regression.csv')
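Since this table is saved explicitly for regression, the sketch below shows one hypothetical way to run the regression in Python instead of Excel. The target column name 'acr' is a placeholder assumption and should be replaced with the real ACR column in the saved file.
# Hypothetical regression on the reshaped one-row-per-participant table
from sklearn.linear_model import LinearRegression
ready = pd.read_csv(data_folder + 'food_group_data_ready_for_excel_regression.csv')
target_col = 'acr'  # placeholder name, not taken from the data file
numeric = ready.select_dtypes(include=[np.number]).dropna()
assert target_col in numeric.columns, "set target_col to the actual ACR column"
X = numeric.drop(columns=[target_col])
y_reg = numeric[target_col]
model = LinearRegression().fit(X, y_reg)
print("R^2 on the training data:", model.score(X, y_reg))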
foodgroup_acr_data_df_t = foodgroup_acr_data_df.T
foodgroup_acr_data_df_t.head(10)
Ref: https://hackernoon.com/reshaping-data-in-python-fa27dda2ff77
https://pandas.pydata.org/pandas-docs/stable/reference/frame.html