"""
Module for ensuring the validity of uploaded CSV files.
"""
import pandas as pd
import numpy as np
def check_mpt_data(df, expected_columns):
    """
    Checks that a pandas DataFrame has at least one row of data and contains specific columns.

    Parameters
    -----------
    df : pandas.DataFrame or None
        The DataFrame to check. ``None`` is treated as "no data" and yields
        False.
    expected_columns : str or iterable of str
        The column name(s) that the DataFrame is expected to have. A single
        string is treated as one column name, not as a sequence of
        characters.

    Returns
    --------
    bool
        True if the DataFrame contains at least one row of data and all of
        the expected columns, False otherwise.
    """
    # Nothing was loaded at all, so there is nothing to validate.
    if df is None:
        return False

    # Iterating over a bare string would test each character as a column.
    if isinstance(expected_columns, str):
        expected_columns = [expected_columns]

    # Check that all of the expected columns are present
    columns_present = all(col in df.columns for col in expected_columns)

    # Check that the DataFrame has at least one row of data
    has_data = not df.empty

    # Valid only if both the expected columns and data are present
    return columns_present and has_data
def clean_mpt_data(df, features_to_keep='default', target_column=None):
    """
    Cleans a pandas DataFrame containing MPT data.

    The DataFrame is reduced to the requested feature columns (followed by
    the target column, if one is given). Rows that hold NaN or +/-inf in any
    kept feature other than 'Deff2' / 'Mean Deff2' are dropped, and every
    missing value that remains is replaced with 0.

    Parameters
    -----------
    df : pandas.DataFrame
        The DataFrame to clean. It is not modified; a new DataFrame is
        returned.
    features_to_keep : 'default', str, or iterable of str, optional
        Feature columns to keep. The string 'default' (the default) selects
        the built-in feature list defined in this function. Any other single
        string is treated as one column name. Lists, tuples, pandas Index
        objects and numpy arrays of names are all accepted.
    target_column : str, optional
        Name of a target column to keep alongside the features. It must
        exist in ``df`` and must not contain missing values.

    Returns
    --------
    df: pandas.DataFrame
        The cleaned DataFrame.

    Raises
    -------
    AssertionError
        If ``target_column`` is given but is absent from ``df`` or contains
        missing values.
    KeyError
        Raised by pandas if a requested feature column is absent from ``df``.
    """
    default_feature_list = [
        'alpha',  # Fitted anomalous diffusion alpha exponent
        'D_fit',  # Fitted anomalous diffusion coefficient
        'kurtosis',  # Kurtosis of track
        'asymmetry1',  # Asymmetry of trajectory (0 for circular symmetric, 1 for linear)
        'asymmetry2',  # Ratio of the smaller to larger principal radius of gyration
        'asymmetry3',  # An asymmetric feature that accounts for non-cylindrically symmetric pt distributions
        'AR',  # Aspect ratio of long and short side of trajectory's minimum bounding rectangle
        'elongation',  # Est. of amount of extension of trajectory from centroid
        'boundedness',  # How much a particle with Deff is restricted by a circular confinement of radius r
        'fractal_dim',  # Measure of how complicated a self similar figure is
        'trappedness',  # Probability that a particle with Deff is trapped in a region
        'efficiency',  # Ratio of squared net displacement to the sum of squared step lengths
        'straightness',  # Ratio of net displacement to the sum of squared step lengths
        'MSD_ratio',  # MSD ratio of the track
        'Deff1',  # Effective diffusion coefficient at 0.33 s
        'Deff2',  # Effective diffusion coefficient at 3.3 s
        'Mean alpha',
        'Mean D_fit',
        'Mean kurtosis',
        'Mean asymmetry1',
        'Mean asymmetry2',
        'Mean asymmetry3',
        'Mean AR',
        'Mean elongation',
        'Mean boundedness',
        'Mean fractal_dim',
        'Mean trappedness',
        'Mean efficiency',
        'Mean straightness',
        'Mean MSD_ratio',
        'Mean Deff1',
        'Mean Deff2',
    ]
    # These columns may hold NaN/inf without causing the row to be dropped;
    # their missing values are zero-filled at the end instead.
    nan_tolerant = {'Deff2', 'Mean Deff2'}

    if target_column is not None:
        # Raised explicitly instead of via `assert` so the validation is not
        # stripped under `python -O`; the exception type is unchanged.
        if target_column not in df.columns:
            raise AssertionError("Target column not in DataFrame")
        if not df[target_column].notna().all():
            raise AssertionError("Target column contains NaN values")

    # Resolve the requested features to a plain list. The isinstance check
    # comes first because comparing an Index/array to 'default' with `==`
    # is elementwise and cannot be used as a condition.
    if isinstance(features_to_keep, str):
        if features_to_keep == 'default':
            feature_names = list(default_feature_list)
        else:
            feature_names = [features_to_keep]
    else:
        feature_names = list(features_to_keep)

    # Keep the features, followed by the target column when requested.
    kept_columns = list(feature_names)
    if target_column is not None:
        kept_columns.append(target_column)
    df = df[kept_columns]

    # Drop rows with NaN or +/-inf in any feature that must be complete.
    strict_columns = list(dict.fromkeys(
        name for name in feature_names if name not in nan_tolerant
    ))
    bad_rows = df[strict_columns].isin([np.inf, np.nan, -np.inf]).any(axis=1)
    df = df[~bad_rows]

    # Zero-fill whatever missing values remain (Deff2 / Mean Deff2). The
    # target column is unaffected: it was verified above to have no missing
    # values.
    return df.fillna(0)