Source code for protpy.autocorrelation

################################################################################
#############                   Autocorrelation                   ##############
################################################################################

from __future__ import annotations

import numpy as np
import pandas as pd
import math
from aaindex import aaindex1

"""
References
==========
[1] B. Hollas, “An analysis of the autocorrelation descriptor for molecules,”
    J. Math. Chem., vol. 33, no. 2, pp. 91–101, 2003.

[2] S. A. K. Ong, H. H. Lin, Y. Z. Chen, Z. R. Li, and Z. Cao, “Efficacy
    of different protein descriptors in predicting protein functional families,”
    BMC Bioinformatics, vol. 8, p. 300, 2007.

[3] D. Raimondi, G. Orlando, W. F. Vranken, and Y. Moreau,
    “Exploring the limitations of biophysical propensity scales coupled with
    machine learning for protein sequence analysis,” Sci. Rep., vol. 9, no. 1, p. 16932, 2019.  
"""

#list of amino acids
from ._constants import amino_acids, _validate_sequence
    
############################### MoreauBroto Autocorrelation ###############################

[docs] def moreaubroto_autocorrelation(sequence: str, lag: int = 30, properties: list[str] = ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], normalize: bool = True) -> pd.DataFrame: """ Calculate MoreauBrotoAuto Autocorrelation (MBAuto) descriptor for sequence. Autocorrelation descriptors are a class of topological descriptors, also known as molecular connectivity indices, that describe the level of correlation between two objects (protein or peptide sequences) in terms of their specific structural or physicochemical properties, which are defined based on the distribution of amino acid properties along the sequence. By default, 8 amino acid properties are used for deriving the descriptors. The derivations and detailed explanations of this type of descriptor is outlined in [1]. The MBAuto descriptor is a type of Autocorrelation descriptor that uses the property values as the basis for measurement [2]. Each autocorrelation will generate the number of features depending on the lag value and number of properties input with total features = lag * number of properties. The output autocorrelation can also be normalized by setting the normalize parameter to true, this occurs by default. Using the default 8 properties with default lag value of 30, 240 features are generated, the default 8 properties are: AccNo. CIDH920105 - Normalized Average Hydrophobicity Scales AccNo. BHAR880101 - Average Flexibility Indices AccNo. CHAM820101 - Polarizability Parameter AccNo. CHAM820102 - Free Energy of Solution in Water, kcal/mole AccNo. CHOC760101 - Residue Accessible Surface Area in Tripeptide AccNo. BIGC670101 - Residue Volume AccNo. CHAM810101 - Steric Parameter AccNo. DAYM780201 - Relative Mutability Parameters ========== :sequence: str protein sequence. :lag: int (default=30) A value for a lag, the max value is equal to the length of the shortest peptide minus one. :properties: list list of AAI index record codes/accession numbers for the physicochemical properties to use in the calculation. Defaults to the 8 standard AAIndex properties listed above. :normalize: bool (default=True) rescale/normalize MoreauBroto Autocorrelation values into range of 0-1. Returns ======= :moreaubroto_autocorrelation_df: pd.Dataframe pandas Dataframe of MBAuto values for protein sequence. Output will be of the shape 1 x N, where N is the number of features calculated from the descriptor and 1 is the input sequence. By default, the shape will be 1 x 240 (30 features per property - using 8 properties and lag=30). """ #validate input protein sequence sequence = _validate_sequence(sequence) #validate lag, set default lag if invalid value input if (lag>=len(sequence) or (lag<0) or not (isinstance(lag, int))): lag = 30 #validate that at least 1 property input to function if (properties == "" or properties == []): raise ValueError('At least one property must be passed into function.') #if 1 property passed into function, wrap it to an iterable list if (isinstance(properties, str)): properties = [properties] #initialise dicts to store AAI properties and values aai_properties = {} for prop in properties: #raise value error if accession number not found in aaindex1 if not (prop in aaindex1.record_codes()): raise ValueError(f"Property {prop} not found in list of available properties in the aaindex1:\n{properties}.") aai_properties[prop] = {} #iterate through list of properties, getting property values for accession numbers from AAIndex1 for prop in list(aai_properties.keys()): #get property values from AAIndex aaindex1[prop].values.pop('-', None) prop_amino_acid_values = aaindex1[prop].values #normalise property values if applicable, calculate mean and std dev aai_property_vals = {} if (normalize): for i, j in prop_amino_acid_values.items(): aai_property_vals[i] = (j - (sum(prop_amino_acid_values.values()) / len(prop_amino_acid_values.values()))) / _std(prop_amino_acid_values.values(), ddof=0) else: #use raw (unnormalized) property values for i, j in prop_amino_acid_values.items(): aai_property_vals[i] = j aa_counter = 0 #assign property and associated amino acid values to aai_property_vals array for i, j in aai_property_vals.items(): aai_property_vals[amino_acids[aa_counter]] = aai_property_vals[i] aa_counter+=1 aai_properties[prop] = aai_property_vals #store resultant autocorrelation values moreaubroto_autocorrelation = {} #iterate through list of properties, calculating autocorrelation values for the sequence, append to results dict for key in aai_properties: temp = 0 for i in range(1, lag+1): for j in range(len(sequence)-i): temp = temp + aai_properties[key][sequence[j]] * aai_properties[key][sequence[j+1]] if (len(sequence) - i == 0): moreaubroto_autocorrelation["MBAuto_" + key + "_" + str(i)] = round( temp / (len(sequence)), 3) else: moreaubroto_autocorrelation["MBAuto_" + key + "_" + str(i)] = round( temp / (len(sequence) - i), 3) #transform values and columns to DataFrame moreaubroto_autocorrelation_df = pd.DataFrame([list(moreaubroto_autocorrelation.values())], columns=list(moreaubroto_autocorrelation.keys())) return moreaubroto_autocorrelation_df
################################## Moran Autocorrelation ##################################
[docs] def moran_autocorrelation(sequence: str, lag: int = 30, properties: list[str] = ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], normalize: bool = True) -> pd.DataFrame: """ Refer to MBAuto docstring for autocorrelation explanation. Moran autocorrelation (MAuto) utilizes property deviations from the average values [2]. Parameters ========== :sequence: str protein sequence. :lag: int (default=30) A value for a lag, the max value is equal to the length of the shortest peptide minus one. :properties: list list of AAI index record codes/accession numbers for the physicochemical properties to use in the calculation. Defaults to the same 8 standard AAIndex properties as MBAuto. :normalize: bool (default=True) rescale/normalize MoreauBroto Autocorrelation values into range of 0-1. Returns ======= :moran_autocorrelation_df: pd.DataFrame pandas Dataframe of MAuto values for protein sequence. Output will be of the shape 1 x N, where N is the number of features calculated from the descriptor and 1 is the input sequence. By default, the shape will be 1 x 240 (30 features per property - using 8 properties and lag=30). """ #validate input protein sequence sequence = _validate_sequence(sequence) #validate lag, set default lag if invalid value input if (lag>=len(sequence) or (lag<0) or not (isinstance(lag, int))): lag = 30 #validate at least 1 property input to function if (properties == "" or properties == []): raise ValueError('At least one property must be passed into function.') #if 1 property passed into function, wrap it to an iterable list if (isinstance(properties, str)): properties = [properties] #initialise dicts to store AAI properties and values aai_properties = {} for prop in properties: #raise value error if accession number not found in aaindex1 if not (prop in aaindex1.record_codes()): raise ValueError(f"Property {prop} not found in list of available properties in the aaindex1:\n{properties}.") aai_properties[prop] = {} #iterate through list of properties, getting property values for accession numbers from AAIndex1 for prop in list(aai_properties.keys()): #get property values from AAIndex aaindex1[prop].values.pop('-', None) prop_aminoacid_values = aaindex1[prop].values aai_property_vals = {} #normalise property values, calculate mean and std dev if (normalize): for i, j in prop_aminoacid_values.items(): aai_property_vals[i] = (j - (sum(prop_aminoacid_values.values()) / len(prop_aminoacid_values.values()))) / _std(prop_aminoacid_values.values(), ddof=0) else: #use raw (unnormalized) property values for i, j in prop_aminoacid_values.items(): aai_property_vals[i] = j aa_counter = 0 #assign property and associated amino acid values to aai_property_vals array for i, j in aai_property_vals.items(): aai_property_vals[amino_acids[aa_counter]] = aai_property_vals[i] aa_counter+=1 aai_properties[prop] = aai_property_vals #store resultant autocorrelation values moran_autocorrelation = {} #iterate through list of properties, calculating autocorrelation values for the sequence, append to results dict for key in aai_properties: cds = 0 for aa in amino_acids: cds = cds + sequence.count(aa) * aai_properties[key][aa] prop_mean = cds / len(sequence) cc = [] for aa in sequence: cc.append(aai_properties[key][aa]) k = (_std(cc, ddof=0)) ** 2 for i in range(1, lag+1): temp = 0 for j in range(len(sequence) - i): temp = temp + (aai_properties[key][sequence[j]] - prop_mean) * ( aai_properties[key][sequence[j+i]] - prop_mean) if len(sequence) - i == 0: moran_autocorrelation["MAuto_" + key + "_" + str(i)] = round( temp / ((len(sequence)) / k), 5) else: moran_autocorrelation["MAuto_" + key + "_" + str(i)] = round( temp / ((len(sequence) - i) / k), 5) #transform values and columns to DataFrame moran_autocorrelation_df = pd.DataFrame([list(moran_autocorrelation.values())], columns=list(moran_autocorrelation.keys())) return moran_autocorrelation_df
################################## Geary Autocorrelation ##################################
[docs] def geary_autocorrelation(sequence: str, lag: int = 30, properties: list[str] = ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102", "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], normalize: bool = True) -> pd.DataFrame: """ Refer to MBAuto docstring for autocorrelation description. Geary Autocorrelation (GAuto) utilizes the square-difference of property values instead of vector-products (of property values or deviations) [2]. Parameters ========== :sequence: str protein sequence. :lag: int (default=30) A value for a lag, the max value is equal to the length of the shortest peptide minus one. :properties: list list of AAI index record codes/accession numbers for the physicochemical properties to use in the calculation. Defaults to the same 8 standard AAIndex properties as MBAuto. :normalize: bool (default=True) rescale/normalize MoreauBroto Autocorrelation values into range of 0-1. Returns ======= :geary_autocorrelation_df: pd.DataFrame pandas Dataframe of MAuto values for protein sequence. Output will be of the shape 1 x N, where N is the number of features calculated from the descriptor and 1 is the input sequence. By default, the shape will be 1 x 240 (30 features per property - using 8 properties and lag=30). """ #validate input protein sequence sequence = _validate_sequence(sequence) #validate lag, set default lag if invalid value input if (lag>=len(sequence) or (lag<0) or not (isinstance(lag, int))): lag = 30 #validate at least 1 property input to function if (properties == "" or properties == []): raise ValueError('At least one property must be passed into function.') #if 1 property passed into function, wrap it to an iterable list if (isinstance(properties, str)): properties = [properties] #initialise dicts to store AAI properties and values aai_properties = {} for prop in properties: #raise value error if accession number not found in aaindex1 if not (prop in aaindex1.record_codes()): raise ValueError(f"Property {prop} not found in list of available properties in the aaindex1:\n{properties}.") aai_properties[prop] = {} #iterate through list of properties, getting property values for accession numbers from AAIndex1 for prop in list(aai_properties.keys()): #get property values from AAIndex & reshape aaindex1[prop].values.pop('-', None) prop_aminoacid_values = aaindex1[prop].values aai_property_vals = {} #normalise property values, calculate mean and std dev if (normalize): for i, j in prop_aminoacid_values.items(): aai_property_vals[i] = (j - (sum(prop_aminoacid_values.values()) / len(prop_aminoacid_values.values()))) / _std(prop_aminoacid_values.values(), ddof=0) else: #use raw (unnormalized) property values for i, j in prop_aminoacid_values.items(): aai_property_vals[i] = j aa_counter = 0 #assign property and associated amino acid values to aai_property_vals array for i, j in aai_property_vals.items(): aai_property_vals[amino_acids[aa_counter]] = aai_property_vals[i] aa_counter+=1 aai_properties[prop] = aai_property_vals #store resultant autocorrelation values geary_autocorrelation = {} #iterate through list of properties, calculating autocorrelation values for the sequence, append to results dict for key in aai_properties: cc = [] for aa in sequence: cc.append(aai_properties[key][aa]) k = ((np.std(cc, ddof=0)) ** 2) * len(sequence) / (len(sequence) - 1) for i in range(1, lag+1): temp = 0 for j in range(len(sequence) - i): temp = (temp + ( aai_properties[key][sequence[j]] - aai_properties[key][sequence[j+i]]) ** 2) if len(sequence) - i == 0: geary_autocorrelation["GAuto_" + key + "_" + str(i)] = round( temp / (2* (len(sequence))) / k, 3) else: geary_autocorrelation["GAuto_" + key + "_" + str(i)] = round( temp / (2* (len(sequence) -i)) / k, 3) #transform values and columns to DataFrame geary_autocorrelation_df = pd.DataFrame([list(geary_autocorrelation.values())], columns=list(geary_autocorrelation.keys())) return geary_autocorrelation_df
def _std(array: list[float], ddof: int = 1) -> float: """ Calculate the standard deviation of the array data, with associated means Delta Degrees of Freedom (ddof). Parameters ========== :array: np.array numpy array of float values. :ddof: int (default=1) means delta degrees of freedom. Returns ======= :result: np.array input array after standard deviation transformation. """ return math.sqrt(sum([math.pow(i - sum(array) / len(array), 2) for i in array]) / (len(array) - ddof))