################################################################################
############# Autocorrelation ##############
################################################################################
from __future__ import annotations
import numpy as np
import pandas as pd
import math
from aaindex import aaindex1
"""
References
==========
[1] B. Hollas, “An analysis of the autocorrelation descriptor for molecules,”
J. Math. Chem., vol. 33, no. 2, pp. 91–101, 2003.
[2] S. A. K. Ong, H. H. Lin, Y. Z. Chen, Z. R. Li, and Z. Cao, “Efficacy
of different protein descriptors in predicting protein functional families,”
BMC Bioinformatics, vol. 8, p. 300, 2007.
[3] D. Raimondi, G. Orlando, W. F. Vranken, and Y. Moreau,
“Exploring the limitations of biophysical propensity scales coupled with
machine learning for protein sequence analysis,” Sci. Rep., vol. 9, no. 1, p. 16932, 2019.
"""
#list of amino acids
from ._constants import amino_acids, _validate_sequence
############################### MoreauBroto Autocorrelation ###############################
[docs]
def moreaubroto_autocorrelation(sequence: str, lag: int = 30,
properties: list[str] = ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102",
"CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], normalize: bool = True) -> pd.DataFrame:
"""
Calculate MoreauBrotoAuto Autocorrelation (MBAuto) descriptor for sequence.
Autocorrelation descriptors are a class of topological descriptors,
also known as molecular connectivity indices, that describe the level of
correlation between two objects (protein or peptide sequences) in terms of
their specific structural or physicochemical properties, which are
defined based on the distribution of amino acid properties along the sequence.
By default, 8 amino acid properties are used for deriving the descriptors.
The derivations and detailed explanations of this type of descriptor is
outlined in [1].
The MBAuto descriptor is a type of Autocorrelation descriptor that uses the
property values as the basis for measurement [2]. Each autocorrelation will
generate the number of features depending on the lag value and number of
properties input with total features = lag * number of properties. The output
autocorrelation can also be normalized by setting the normalize parameter to
true, this occurs by default. Using the default 8 properties with default lag
value of 30, 240 features are generated, the default 8 properties are:
AccNo. CIDH920105 - Normalized Average Hydrophobicity Scales
AccNo. BHAR880101 - Average Flexibility Indices
AccNo. CHAM820101 - Polarizability Parameter
AccNo. CHAM820102 - Free Energy of Solution in Water, kcal/mole
AccNo. CHOC760101 - Residue Accessible Surface Area in Tripeptide
AccNo. BIGC670101 - Residue Volume
AccNo. CHAM810101 - Steric Parameter
AccNo. DAYM780201 - Relative Mutability
Parameters
==========
:sequence: str
protein sequence.
:lag: int (default=30)
A value for a lag, the max value is equal to the length of the shortest peptide minus one.
:properties: list
list of AAI index record codes/accession numbers for the physicochemical properties to use in the calculation. Defaults to the 8 standard AAIndex properties listed above.
:normalize: bool (default=True)
rescale/normalize MoreauBroto Autocorrelation values into range of 0-1.
Returns
=======
:moreaubroto_autocorrelation_df: pd.Dataframe
pandas Dataframe of MBAuto values for protein sequence. Output will
be of the shape 1 x N, where N is the number of features calculated from
the descriptor and 1 is the input sequence. By default, the shape will
be 1 x 240 (30 features per property - using 8 properties and lag=30).
"""
#validate input protein sequence
sequence = _validate_sequence(sequence)
#validate lag, set default lag if invalid value input
if (lag>=len(sequence) or (lag<0) or not (isinstance(lag, int))):
lag = 30
#validate that at least 1 property input to function
if (properties == "" or properties == []):
raise ValueError('At least one property must be passed into function.')
#if 1 property passed into function, wrap it to an iterable list
if (isinstance(properties, str)):
properties = [properties]
#initialise dicts to store AAI properties and values
aai_properties = {}
for prop in properties:
#raise value error if accession number not found in aaindex1
if not (prop in aaindex1.record_codes()):
raise ValueError(f"Property {prop} not found in list of available properties in the aaindex1:\n{properties}.")
aai_properties[prop] = {}
#iterate through list of properties, getting property values for accession numbers from AAIndex1
for prop in list(aai_properties.keys()):
#get property values from AAIndex
aaindex1[prop].values.pop('-', None)
prop_amino_acid_values = aaindex1[prop].values
#normalise property values if applicable, calculate mean and std dev
aai_property_vals = {}
if (normalize):
for i, j in prop_amino_acid_values.items():
aai_property_vals[i] = (j - (sum(prop_amino_acid_values.values()) /
len(prop_amino_acid_values.values()))) / _std(prop_amino_acid_values.values(), ddof=0)
else:
#use raw (unnormalized) property values
for i, j in prop_amino_acid_values.items():
aai_property_vals[i] = j
aa_counter = 0
#assign property and associated amino acid values to aai_property_vals array
for i, j in aai_property_vals.items():
aai_property_vals[amino_acids[aa_counter]] = aai_property_vals[i]
aa_counter+=1
aai_properties[prop] = aai_property_vals
#store resultant autocorrelation values
moreaubroto_autocorrelation = {}
#iterate through list of properties, calculating autocorrelation values for the sequence, append to results dict
for key in aai_properties:
temp = 0
for i in range(1, lag+1):
for j in range(len(sequence)-i):
temp = temp + aai_properties[key][sequence[j]] * aai_properties[key][sequence[j+1]]
if (len(sequence) - i == 0):
moreaubroto_autocorrelation["MBAuto_" + key + "_" + str(i)] = round(
temp / (len(sequence)), 3)
else:
moreaubroto_autocorrelation["MBAuto_" + key + "_" + str(i)] = round(
temp / (len(sequence) - i), 3)
#transform values and columns to DataFrame
moreaubroto_autocorrelation_df = pd.DataFrame([list(moreaubroto_autocorrelation.values())],
columns=list(moreaubroto_autocorrelation.keys()))
return moreaubroto_autocorrelation_df
################################## Moran Autocorrelation ##################################
[docs]
def moran_autocorrelation(sequence: str, lag: int = 30,
properties: list[str] = ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102",
"CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], normalize: bool = True) -> pd.DataFrame:
"""
Refer to MBAuto docstring for autocorrelation explanation.
Moran autocorrelation (MAuto) utilizes property deviations from the
average values [2].
Parameters
==========
:sequence: str
protein sequence.
:lag: int (default=30)
A value for a lag, the max value is equal to the length of the shortest peptide minus one.
:properties: list
list of AAI index record codes/accession numbers for the physicochemical properties to use in the calculation. Defaults to the same 8 standard AAIndex properties as MBAuto.
:normalize: bool (default=True)
rescale/normalize MoreauBroto Autocorrelation values into range of 0-1.
Returns
=======
:moran_autocorrelation_df: pd.DataFrame
pandas Dataframe of MAuto values for protein sequence. Output will
be of the shape 1 x N, where N is the number of features calculated from
the descriptor and 1 is the input sequence. By default, the shape will
be 1 x 240 (30 features per property - using 8 properties and lag=30).
"""
#validate input protein sequence
sequence = _validate_sequence(sequence)
#validate lag, set default lag if invalid value input
if (lag>=len(sequence) or (lag<0) or not (isinstance(lag, int))):
lag = 30
#validate at least 1 property input to function
if (properties == "" or properties == []):
raise ValueError('At least one property must be passed into function.')
#if 1 property passed into function, wrap it to an iterable list
if (isinstance(properties, str)):
properties = [properties]
#initialise dicts to store AAI properties and values
aai_properties = {}
for prop in properties:
#raise value error if accession number not found in aaindex1
if not (prop in aaindex1.record_codes()):
raise ValueError(f"Property {prop} not found in list of available properties in the aaindex1:\n{properties}.")
aai_properties[prop] = {}
#iterate through list of properties, getting property values for accession numbers from AAIndex1
for prop in list(aai_properties.keys()):
#get property values from AAIndex
aaindex1[prop].values.pop('-', None)
prop_aminoacid_values = aaindex1[prop].values
aai_property_vals = {}
#normalise property values, calculate mean and std dev
if (normalize):
for i, j in prop_aminoacid_values.items():
aai_property_vals[i] = (j - (sum(prop_aminoacid_values.values()) /
len(prop_aminoacid_values.values()))) / _std(prop_aminoacid_values.values(), ddof=0)
else:
#use raw (unnormalized) property values
for i, j in prop_aminoacid_values.items():
aai_property_vals[i] = j
aa_counter = 0
#assign property and associated amino acid values to aai_property_vals array
for i, j in aai_property_vals.items():
aai_property_vals[amino_acids[aa_counter]] = aai_property_vals[i]
aa_counter+=1
aai_properties[prop] = aai_property_vals
#store resultant autocorrelation values
moran_autocorrelation = {}
#iterate through list of properties, calculating autocorrelation values for the sequence, append to results dict
for key in aai_properties:
cds = 0
for aa in amino_acids:
cds = cds + sequence.count(aa) * aai_properties[key][aa]
prop_mean = cds / len(sequence)
cc = []
for aa in sequence:
cc.append(aai_properties[key][aa])
k = (_std(cc, ddof=0)) ** 2
for i in range(1, lag+1):
temp = 0
for j in range(len(sequence) - i):
temp = temp + (aai_properties[key][sequence[j]] - prop_mean) * (
aai_properties[key][sequence[j+i]] - prop_mean)
if len(sequence) - i == 0:
moran_autocorrelation["MAuto_" + key + "_" + str(i)] = round(
temp / ((len(sequence)) / k), 5)
else:
moran_autocorrelation["MAuto_" + key + "_" + str(i)] = round(
temp / ((len(sequence) - i) / k), 5)
#transform values and columns to DataFrame
moran_autocorrelation_df = pd.DataFrame([list(moran_autocorrelation.values())],
columns=list(moran_autocorrelation.keys()))
return moran_autocorrelation_df
################################## Geary Autocorrelation ##################################
[docs]
def geary_autocorrelation(sequence: str, lag: int = 30,
properties: list[str] = ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102",
"CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], normalize: bool = True) -> pd.DataFrame:
"""
Refer to MBAuto docstring for autocorrelation description.
Geary Autocorrelation (GAuto) utilizes the square-difference of property
values instead of vector-products (of property values or deviations) [2].
Parameters
==========
:sequence: str
protein sequence.
:lag: int (default=30)
A value for a lag, the max value is equal to the length of the shortest peptide minus one.
:properties: list
list of AAI index record codes/accession numbers for the physicochemical properties to use in the calculation. Defaults to the same 8 standard AAIndex properties as MBAuto.
:normalize: bool (default=True)
rescale/normalize MoreauBroto Autocorrelation values into range of 0-1.
Returns
=======
:geary_autocorrelation_df: pd.DataFrame
pandas Dataframe of MAuto values for protein sequence. Output will
be of the shape 1 x N, where N is the number of features calculated from
the descriptor and 1 is the input sequence. By default, the shape will
be 1 x 240 (30 features per property - using 8 properties and lag=30).
"""
#validate input protein sequence
sequence = _validate_sequence(sequence)
#validate lag, set default lag if invalid value input
if (lag>=len(sequence) or (lag<0) or not (isinstance(lag, int))):
lag = 30
#validate at least 1 property input to function
if (properties == "" or properties == []):
raise ValueError('At least one property must be passed into function.')
#if 1 property passed into function, wrap it to an iterable list
if (isinstance(properties, str)):
properties = [properties]
#initialise dicts to store AAI properties and values
aai_properties = {}
for prop in properties:
#raise value error if accession number not found in aaindex1
if not (prop in aaindex1.record_codes()):
raise ValueError(f"Property {prop} not found in list of available properties in the aaindex1:\n{properties}.")
aai_properties[prop] = {}
#iterate through list of properties, getting property values for accession numbers from AAIndex1
for prop in list(aai_properties.keys()):
#get property values from AAIndex & reshape
aaindex1[prop].values.pop('-', None)
prop_aminoacid_values = aaindex1[prop].values
aai_property_vals = {}
#normalise property values, calculate mean and std dev
if (normalize):
for i, j in prop_aminoacid_values.items():
aai_property_vals[i] = (j - (sum(prop_aminoacid_values.values()) /
len(prop_aminoacid_values.values()))) / _std(prop_aminoacid_values.values(), ddof=0)
else:
#use raw (unnormalized) property values
for i, j in prop_aminoacid_values.items():
aai_property_vals[i] = j
aa_counter = 0
#assign property and associated amino acid values to aai_property_vals array
for i, j in aai_property_vals.items():
aai_property_vals[amino_acids[aa_counter]] = aai_property_vals[i]
aa_counter+=1
aai_properties[prop] = aai_property_vals
#store resultant autocorrelation values
geary_autocorrelation = {}
#iterate through list of properties, calculating autocorrelation values for the sequence, append to results dict
for key in aai_properties:
cc = []
for aa in sequence:
cc.append(aai_properties[key][aa])
k = ((np.std(cc, ddof=0)) ** 2) * len(sequence) / (len(sequence) - 1)
for i in range(1, lag+1):
temp = 0
for j in range(len(sequence) - i):
temp = (temp + (
aai_properties[key][sequence[j]] - aai_properties[key][sequence[j+i]]) ** 2)
if len(sequence) - i == 0:
geary_autocorrelation["GAuto_" + key + "_" + str(i)] = round(
temp / (2* (len(sequence))) / k, 3)
else:
geary_autocorrelation["GAuto_" + key + "_" + str(i)] = round(
temp / (2* (len(sequence) -i)) / k, 3)
#transform values and columns to DataFrame
geary_autocorrelation_df = pd.DataFrame([list(geary_autocorrelation.values())],
columns=list(geary_autocorrelation.keys()))
return geary_autocorrelation_df
def _std(array: list[float], ddof: int = 1) -> float:
"""
Calculate the standard deviation of the array data, with associated means Delta Degrees
of Freedom (ddof).
Parameters
==========
:array: np.array
numpy array of float values.
:ddof: int (default=1)
means delta degrees of freedom.
Returns
=======
:result: np.array
input array after standard deviation transformation.
"""
return math.sqrt(sum([math.pow(i - sum(array) / len(array), 2) for i in array])
/ (len(array) - ddof))