Data Preprocessing: Sentiment Analysis of Electrodermal Activity Data for Personalized Interventions in Mental Health

Sarthak Increase
7 min read · May 10, 2023


```python
# Import libraries
import numpy as np
import pandas as pd
import pickle
import os
from scipy import signal          # For filtering
from sklearn import preprocessing # For normalizing

# Define constants
DATA_PATH = "WESAD/"  # Path to the dataset folder
SUBJECTS = ["S2", "S3", "S4", "S5", "S6", "S7", "S8", "S9", "S10", "S11", "S13", "S14", "S15", "S16", "S17"]  # The 15 WESAD subjects (there is no S12)
DEVICE = "wrist"  # Key for the wrist-worn Empatica E4 signals in the WESAD pickle
MODALITIES = ["ACC", "BVP", "EDA", "TEMP"]  # Sensor modalities
SRATE = {"ACC": 32, "BVP": 64, "EDA": 4, "TEMP": 4}  # Native sampling rates (Hz)
LABELS = {1: "baseline", 2: "stress", 3: "amusement"}  # Labels for affective states
```

```python
# Define functions
def load_subject_data(subject):
    """
    Load the data of a single subject from a pickle file.
    Input: subject - string, subject ID (e.g. "S2")
    Output: data - dictionary, contains labels and sensor data
    """
    file_path = os.path.join(DATA_PATH, subject, subject + ".pkl")
    with open(file_path, "rb") as file:
        # latin1 encoding is needed because the WESAD pickles were written in Python 2
        data = pickle.load(file, encoding="latin1")
    return data
```
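
Before running the full pipeline, it helps to peek at what the loader returns. The key layout below is the one the WESAD readme documents: wrist signals under `data["signal"]["wrist"]` and the 700 Hz labels under `data["label"]`.

```python
# Quick look at the structure of one subject's pickle
data = load_subject_data("S2")
print(data.keys())                      # expected: 'signal', 'label', 'subject'
print(data["signal"][DEVICE].keys())    # expected: 'ACC', 'BVP', 'EDA', 'TEMP'
print(np.asarray(data["label"]).shape)  # the labels are sampled at 700 Hz
```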

```python
def resample(data, srate_in, srate_out):
    """
    Resample the data to a different sampling rate using linear interpolation.
    Input: data - array, original data
           srate_in - int, original sampling rate
           srate_out - int, desired sampling rate
    Output: data_resampled - array, resampled data
    """
    n_in = len(data)
    n_out = int(n_in * srate_out / srate_in)
    # Interpolate onto an evenly spaced grid spanning the original sample indices
    data_resampled = np.interp(np.linspace(0, n_in - 1, n_out), np.arange(n_in), data)
    return data_resampled
```
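
A quick way to convince yourself the helper behaves as intended is to downsample a signal whose length you know. The numbers here are illustrative and have nothing to do with the dataset.

```python
# Downsample five seconds of a 1 Hz sine from 64 Hz to 4 Hz
t = np.arange(0, 5, 1 / 64)
sine = np.sin(2 * np.pi * t)
sine_4hz = resample(sine, 64, 4)
print(len(sine), "->", len(sine_4hz))  # 320 -> 20
```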

```python
def butter_filter(data, srate, lowcut=None, highcut=None):
    """
    Filter the data using a zero-phase Butterworth filter.
    (Named butter_filter so it does not shadow Python's built-in filter.)
    Input: data - array, original data
           srate - int, sampling rate of the data
           lowcut - float or None, low cutoff frequency (Hz); None for a lowpass filter
           highcut - float or None, high cutoff frequency (Hz); None for a highpass filter
    Output: data_filtered - array, filtered data
    """
    nyq = 0.5 * srate  # Nyquist frequency

    # Check the cutoff frequencies and determine the filter type and order
    if lowcut is not None and highcut is not None:
        filter_type = 'bandpass'
        order = 4  # You can change the order as needed
        cutoff = [lowcut / nyq, highcut / nyq]  # Normalized cutoff frequencies
    elif lowcut is not None:
        filter_type = 'highpass'
        order = 2  # You can change the order as needed
        cutoff = lowcut / nyq  # Normalized cutoff frequency
    elif highcut is not None:
        filter_type = 'lowpass'
        order = 2  # You can change the order as needed
        cutoff = highcut / nyq  # Normalized cutoff frequency
    else:
        raise ValueError("At least one of lowcut and highcut must be given")

    # Design the Butterworth filter using scipy.signal.butter
    b, a = signal.butter(order, cutoff, btype=filter_type)
    # Apply it forwards and backwards with scipy.signal.filtfilt (zero phase distortion)
    data_filtered = signal.filtfilt(b, a, data)

    return data_filtered
```
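
One caveat worth calling out: `signal.butter` requires normalized cutoffs strictly between 0 and 1, so every cutoff in Hz must stay below the Nyquist frequency (half the sampling rate), otherwise it raises a `ValueError`. A minimal illustration with made-up data:

```python
# A 0.5-10 Hz bandpass on a 32 Hz signal is fine (Nyquist = 16 Hz)
acc_demo = np.random.randn(32 * 60)  # one minute of fake 32 Hz data
acc_filtered = butter_filter(acc_demo, 32, lowcut=0.5, highcut=10)

# A 5 Hz lowpass on a 4 Hz signal fails (Nyquist = 2 Hz):
# butter_filter(np.random.randn(240), 4, highcut=5)  # ValueError from signal.butter
```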

```python
def normalize(data):
    """
    Normalize the data to zero mean and unit variance using sklearn.preprocessing.scale.
    Input: data - array, original data
    Output: data_normalized - array, normalized data
    """
    data_normalized = preprocessing.scale(data)
    return data_normalized
```

```python
def smooth(data, window_size):
    """
    Smooth the data using a moving average filter with the specified window size.
    Input: data - array, original data
           window_size - int, size of the moving window
    Output: data_smoothed - array, smoothed data
    """
    # Create a one-dimensional window of uniform weights that sum to one
    window = np.ones(window_size) / window_size

    # Apply the moving average with numpy.convolve; 'same' mode preserves the
    # original length (the first and last few samples are averaged over a
    # partially empty window, so they are pulled toward zero)
    data_smoothed = np.convolve(data, window, mode='same')

    return data_smoothed
```

```python
def extract_features(data):
    """
    Extract features from the raw sensor data.
    Input: data - array, raw sensor data
    Output: features - array, extracted features
    """
    # For this example, we only use the mean and standard deviation as features.
    # You can add more features such as min, max, median, skewness, kurtosis, etc.
    mean = np.mean(data)
    std = np.std(data)
    features = np.array([mean, std])
    return features
```
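
If you want to act on the comment above and grow the feature set, a drop-in alternative might look like the sketch below; `scipy.stats` provides the skewness and kurtosis estimators. Remember that the feature-name list built later in `preprocess_subject_data` has to be extended to match.

```python
from scipy import stats

def extract_features_extended(data):
    """A richer drop-in alternative: mean, std, min, max, median, skewness, kurtosis."""
    return np.array([
        np.mean(data), np.std(data),
        np.min(data), np.max(data), np.median(data),
        stats.skew(data), stats.kurtosis(data),
    ])
```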

```python
def preprocess_subject_data(data):
    """
    Preprocess the data of a single subject by filtering, resampling, normalizing,
    smoothing, segmenting and extracting features.
    Input: data - dictionary, contains labels and sensor data
    Output: X - array, feature matrix (shape: n_segments x n_features)
            y - array, label vector (shape: n_segments)
            srates - dictionary, resampled sampling rate for each modality
            segments - dictionary, segment start and end times for each modality
            features - list, feature names (mean and std for each modality)
    """
    # Resample the 700 Hz label stream to 4 Hz (the lowest sampling rate among
    # the sensors) by nearest-neighbour indexing; linear interpolation would
    # produce fractional labels that have no meaning
    raw_label = np.asarray(data["label"])
    n_out = int(len(raw_label) * 4 / 700)
    label = raw_label[np.round(np.linspace(0, len(raw_label) - 1, n_out)).astype(int)]

    srates = {}        # Resampled sampling rate for each modality
    segments = {}      # Segment (start, end) times in seconds for each modality
    features = []      # Feature names, e.g. "EDA_mean"
    per_modality = {}  # Per-segment feature rows for each modality

    # Loop through each sensor modality
    for modality in MODALITIES:

        # WESAD stores the wrist signals under data["signal"]["wrist"]
        sensor_data = np.asarray(data["signal"][DEVICE][modality], dtype=float)
        if sensor_data.ndim > 1 and sensor_data.shape[1] > 1:
            # ACC is three-axis; reduce it to the acceleration magnitude
            sensor_data = np.linalg.norm(sensor_data, axis=1)
        else:
            sensor_data = sensor_data.ravel()

        # Filter at each signal's native rate, so every cutoff stays below the
        # Nyquist frequency, and only then resample to 4 Hz. The cutoffs are
        # arbitrary values based on common sense and the literature; change
        # them as needed or use different filters for different modalities.
        if modality == "ACC":
            # Bandpass 0.5-10 Hz (native rate 32 Hz)
            sensor_data_filtered = butter_filter(sensor_data, SRATE[modality], lowcut=0.5, highcut=10)
        elif modality == "BVP":
            # Bandpass 0.5-20 Hz (native rate 64 Hz)
            sensor_data_filtered = butter_filter(sensor_data, SRATE[modality], lowcut=0.5, highcut=20)
        elif modality == "EDA":
            # Lowpass 1 Hz: EDA is sampled at 4 Hz, so any cutoff must stay
            # below its 2 Hz Nyquist frequency (a 5 Hz cutoff is impossible here)
            sensor_data_filtered = butter_filter(sensor_data, SRATE[modality], highcut=1)
        elif modality == "TEMP":
            # Lowpass 0.1 Hz: skin temperature changes slowly
            sensor_data_filtered = butter_filter(sensor_data, SRATE[modality], highcut=0.1)

        # Resample the filtered signal to 4 Hz
        sensor_data_resampled = resample(sensor_data_filtered, SRATE[modality], 4)
        srates[modality] = 4

        # Normalize the sensor data to zero mean and unit variance
        sensor_data_normalized = normalize(sensor_data_resampled)

        # Smooth with a 5-sample moving average (tune per modality as needed)
        sensor_data_smoothed = smooth(sensor_data_normalized, 5)

        # Segment into non-overlapping windows of 60 seconds (240 samples at 4 Hz)
        window_size = 240
        n_segments = len(sensor_data_smoothed) // window_size
        sensor_data_segmented = np.array_split(
            sensor_data_smoothed[:n_segments * window_size], n_segments)

        # Store the segment start and end times in seconds
        segments[modality] = [(i * window_size / srates[modality],
                               (i + 1) * window_size / srates[modality])
                              for i in range(n_segments)]

        # Extract one feature row per segment for this modality
        per_modality[modality] = [extract_features(seg) for seg in sensor_data_segmented]
        features.extend([modality + "_mean", modality + "_std"])

    # All modalities now run at 4 Hz, so their segment counts can differ by at
    # most one trailing window; align everything on the minimum
    n_segments = min(len(rows) for rows in per_modality.values())

    # Concatenate the per-modality features into one row per segment, so the
    # columns line up with the feature names collected above
    X = np.array([np.concatenate([per_modality[m][i] for m in MODALITIES])
                  for i in range(n_segments)])

    # Segment the label data accordingly and take the most frequent label
    # in each window as the segment label
    label_segmented = np.array_split(label[:n_segments * 240], n_segments)
    y = np.array([np.bincount(seg.astype(int)).argmax() for seg in label_segmented])

    return X, y, srates, segments, features
```
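
Before looping over everyone, it is worth confirming the shapes on one subject. The counts below are illustrative; they depend on each recording's length.

```python
# Sanity check on a single subject
data = load_subject_data("S2")
X, y, srates, segments, features = preprocess_subject_data(data)
print(X.shape, y.shape)  # e.g. (n_segments, 8) and (n_segments,)
print(features)          # ['ACC_mean', 'ACC_std', 'BVP_mean', ..., 'TEMP_std']
```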

```python
# Preprocess the data of all subjects and save them as csv files
for subject in SUBJECTS:
    print("Preprocessing data for subject", subject)
    data = load_subject_data(subject)
    X, y, srates, segments, features = preprocess_subject_data(data)
    df = pd.DataFrame(X, columns=features)
    df["label"] = y
    df["subject"] = subject
    df.to_csv(os.path.join(DATA_PATH, subject + "_preprocessed.csv"), index=False)
print("Done!")
```