Source code for protpy.autocorrelation

################################################################################
#############                   Autocorrelation                   ##############
################################################################################

from __future__ import annotations

import numpy as np
import pandas as pd
import math
from aaindex import aaindex1

"""
References
==========
[1] B. Hollas, “An analysis of the autocorrelation descriptor for molecules,”
    J. Math. Chem., vol. 33, no. 2, pp. 91–101, 2003.

[2] S. A. K. Ong, H. H. Lin, Y. Z. Chen, Z. R. Li, and Z. Cao, “Efficacy
    of different protein descriptors in predicting protein functional families,”
    BMC Bioinformatics, vol. 8, p. 300, 2007.

[3] D. Raimondi, G. Orlando, W. F. Vranken, and Y. Moreau,
    “Exploring the limitations of biophysical propensity scales coupled with
    machine learning for protein sequence analysis,” Sci. Rep., vol. 9, no. 1, p. 16932, 2019.  
"""

#list of amino acids
from ._constants import amino_acids, _validate_sequence
    
############################### MoreauBroto Autocorrelation ###############################


[docs]
def moreaubroto_autocorrelation(sequence: str, lag: int = 30,
    properties: list[str] = ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102",
    "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], normalize: bool = True) -> pd.DataFrame:
    """
    Calculate MoreauBrotoAuto Autocorrelation (MBAuto) descriptor for sequence.
    Autocorrelation descriptors are a class of topological descriptors,
    also known as molecular connectivity indices, that describe the level of
    correlation between two objects (protein or peptide sequences) in terms of
    their specific structural or physicochemical properties, which are
    defined based on the distribution of amino acid properties along the sequence.
    By default, 8 amino acid properties are used for deriving the descriptors. 
    The derivations and detailed explanations of this type of descriptor is 
    outlined in [1]. 
    
    The MBAuto descriptor is a type of Autocorrelation descriptor that uses the
    property values as the basis for measurement [2]. Each autocorrelation will 
    generate the number of features depending on the lag value and number of 
    properties input with total features = lag * number of properties. The output 
    autocorrelation can also be normalized by setting the normalize parameter to 
    true, this occurs by default. Using the default 8 properties with default lag 
    value of 30, 240 features are generated, the default 8 properties are:

    AccNo. CIDH920105 - Normalized Average Hydrophobicity Scales
    AccNo. BHAR880101 - Average Flexibility Indices
    AccNo. CHAM820101 - Polarizability Parameter
    AccNo. CHAM820102 - Free Energy of Solution in Water, kcal/mole
    AccNo. CHOC760101 - Residue Accessible Surface Area in Tripeptide
    AccNo. BIGC670101 - Residue Volume
    AccNo. CHAM810101 - Steric Parameter
    AccNo. DAYM780201 - Relative Mutability

    Parameters
    ==========
    :sequence: str
        protein sequence.
    :lag: int (default=30)
        A value for a lag, the max value is equal to the length of the shortest peptide minus one.
    :properties: list
        list of AAI index record codes/accession numbers for the physicochemical properties to use in the calculation. Defaults to the 8 standard AAIndex properties listed above.
    :normalize: bool (default=True)
        rescale/normalize MoreauBroto Autocorrelation values into range of 0-1.

    Returns
    =======
    :moreaubroto_autocorrelation_df: pd.Dataframe
        pandas Dataframe of MBAuto values for protein sequence. Output will
        be of the shape 1 x N, where N is the number of features calculated from
        the descriptor and 1 is the input sequence. By default, the shape will
        be 1 x 240 (30 features per property - using 8 properties and lag=30).
    """
    #validate input protein sequence
    sequence = _validate_sequence(sequence)

    #validate lag, set default lag if invalid value input
    if (lag>=len(sequence) or (lag<0) or not (isinstance(lag, int))):
        lag = 30

    #validate that at least 1 property input to function
    if (properties == "" or properties == []):
        raise ValueError('At least one property must be passed into function.')

    #if 1 property passed into function, wrap it to an iterable list
    if (isinstance(properties, str)):
        properties = [properties]
    
    #initialise dicts to store AAI properties and values
    aai_properties = {}
    for prop in properties:
        #raise value error if accession number not found in aaindex1
        if not (prop in aaindex1.record_codes()):
            raise ValueError(f"Property {prop} not found in list of available properties in the aaindex1:\n{properties}.")
        aai_properties[prop] = {}

    #iterate through list of properties, getting property values for accession numbers from AAIndex1
    for prop in list(aai_properties.keys()):
        #get property values from AAIndex 
        aaindex1[prop].values.pop('-', None)
        prop_amino_acid_values = aaindex1[prop].values
     
        #normalise property values if applicable, calculate mean and std dev
        aai_property_vals = {}
        if (normalize):
            for i, j in prop_amino_acid_values.items():
                aai_property_vals[i] = (j - (sum(prop_amino_acid_values.values()) / 
                    len(prop_amino_acid_values.values()))) / _std(prop_amino_acid_values.values(), ddof=0)
        else:
            #use raw (unnormalized) property values
            for i, j in prop_amino_acid_values.items():
                aai_property_vals[i] = j
        aa_counter = 0

        #assign property and associated amino acid values to aai_property_vals array
        for i, j in aai_property_vals.items():
            aai_property_vals[amino_acids[aa_counter]] = aai_property_vals[i]
            aa_counter+=1
        aai_properties[prop] = aai_property_vals

    #store resultant autocorrelation values
    moreaubroto_autocorrelation = {}

    #iterate through list of properties, calculating autocorrelation values for the sequence, append to results dict
    for key in aai_properties:
        temp = 0
        for i in range(1, lag+1):
            for j in range(len(sequence)-i):
                temp = temp + aai_properties[key][sequence[j]] * aai_properties[key][sequence[j+1]]
            if (len(sequence) - i == 0):
                moreaubroto_autocorrelation["MBAuto_" + key + "_" + str(i)] = round(
                    temp / (len(sequence)), 3)
            else:
                moreaubroto_autocorrelation["MBAuto_" + key + "_" + str(i)] = round(
                    temp / (len(sequence) - i), 3)

    #transform values and columns to DataFrame
    moreaubroto_autocorrelation_df = pd.DataFrame([list(moreaubroto_autocorrelation.values())], 
        columns=list(moreaubroto_autocorrelation.keys()))

    return moreaubroto_autocorrelation_df


################################## Moran Autocorrelation ##################################


[docs]
def moran_autocorrelation(sequence: str, lag: int = 30,
    properties: list[str] = ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102",
    "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], normalize: bool = True) -> pd.DataFrame:
    """
    Refer to MBAuto docstring for autocorrelation explanation.
    Moran autocorrelation (MAuto) utilizes property deviations from the
    average values [2].

    Parameters
    ==========
    :sequence: str
        protein sequence.
    :lag: int (default=30)
        A value for a lag, the max value is equal to the length of the shortest peptide minus one.
    :properties: list
        list of AAI index record codes/accession numbers for the physicochemical properties to use in the calculation. Defaults to the same 8 standard AAIndex properties as MBAuto.
    :normalize: bool (default=True)
        rescale/normalize MoreauBroto Autocorrelation values into range of 0-1.

    Returns
    =======
    :moran_autocorrelation_df: pd.DataFrame
        pandas Dataframe of MAuto values for protein sequence. Output will
        be of the shape 1 x N, where N is the number of features calculated from
        the descriptor and 1 is the input sequence. By default, the shape will 
        be 1 x 240 (30 features per property - using 8 properties and lag=30).  
    """
    #validate input protein sequence
    sequence = _validate_sequence(sequence)

    #validate lag, set default lag if invalid value input
    if (lag>=len(sequence) or (lag<0) or not (isinstance(lag, int))):
        lag = 30

    #validate at least 1 property input to function
    if (properties == "" or properties == []):
        raise ValueError('At least one property must be passed into function.')

    #if 1 property passed into function, wrap it to an iterable list
    if (isinstance(properties, str)):
        properties = [properties]

    #initialise dicts to store AAI properties and values
    aai_properties = {}
    for prop in properties:
        #raise value error if accession number not found in aaindex1
        if not (prop in aaindex1.record_codes()):
            raise ValueError(f"Property {prop} not found in list of available properties in the aaindex1:\n{properties}.")
        aai_properties[prop] = {}

    #iterate through list of properties, getting property values for accession numbers from AAIndex1
    for prop in list(aai_properties.keys()):
        #get property values from AAIndex
        aaindex1[prop].values.pop('-', None)
        prop_aminoacid_values = aaindex1[prop].values

        aai_property_vals = {}

        #normalise property values, calculate mean and std dev        
        if (normalize):
            for i, j in prop_aminoacid_values.items():
                aai_property_vals[i] = (j - (sum(prop_aminoacid_values.values()) / 
                    len(prop_aminoacid_values.values()))) / _std(prop_aminoacid_values.values(), ddof=0)
        else:
            #use raw (unnormalized) property values
            for i, j in prop_aminoacid_values.items():
                aai_property_vals[i] = j
        aa_counter = 0

        #assign property and associated amino acid values to aai_property_vals array
        for i, j in aai_property_vals.items():
            aai_property_vals[amino_acids[aa_counter]] = aai_property_vals[i]
            aa_counter+=1
        aai_properties[prop] = aai_property_vals

    #store resultant autocorrelation values
    moran_autocorrelation = {}

    #iterate through list of properties, calculating autocorrelation values for the sequence, append to results dict
    for key in aai_properties:
        cds = 0
        for aa in amino_acids:
            cds = cds + sequence.count(aa) * aai_properties[key][aa]
        prop_mean = cds / len(sequence)
        cc = []
        for aa in sequence:
            cc.append(aai_properties[key][aa])
        k = (_std(cc, ddof=0)) ** 2
        for i in range(1, lag+1):
            temp = 0
            for j in range(len(sequence) - i):
                temp = temp + (aai_properties[key][sequence[j]] - prop_mean) * (
                    aai_properties[key][sequence[j+i]] - prop_mean)
            if len(sequence) - i == 0:
                moran_autocorrelation["MAuto_" + key + "_" + str(i)] = round(
                    temp / ((len(sequence)) / k), 5)
            else:
                moran_autocorrelation["MAuto_" + key + "_" + str(i)] = round(
                    temp / ((len(sequence) - i) / k), 5)

    #transform values and columns to DataFrame
    moran_autocorrelation_df = pd.DataFrame([list(moran_autocorrelation.values())], 
        columns=list(moran_autocorrelation.keys()))

    return moran_autocorrelation_df


################################## Geary Autocorrelation ##################################


[docs]
def geary_autocorrelation(sequence: str, lag: int = 30,
    properties: list[str] = ["CIDH920105", "BHAR880101", "CHAM820101", "CHAM820102",
    "CHOC760101", "BIGC670101", "CHAM810101", "DAYM780201"], normalize: bool = True) -> pd.DataFrame:
    """
    Refer to MBAuto docstring for autocorrelation description.
    Geary Autocorrelation (GAuto) utilizes the square-difference of property
    values instead of vector-products (of property values or deviations) [2].
    
    Parameters
    ==========
    :sequence: str
        protein sequence.
    :lag: int (default=30)
        A value for a lag, the max value is equal to the length of the shortest peptide minus one.
    :properties: list
        list of AAI index record codes/accession numbers for the physicochemical properties to use in the calculation. Defaults to the same 8 standard AAIndex properties as MBAuto.
    :normalize: bool (default=True)
        rescale/normalize MoreauBroto Autocorrelation values into range of 0-1.

    Returns
    =======
    :geary_autocorrelation_df: pd.DataFrame
        pandas Dataframe of MAuto values for protein sequence. Output will
        be of the shape 1 x N, where N is the number of features calculated from
        the descriptor and 1 is the input sequence. By default, the shape will 
        be 1 x 240 (30 features per property - using 8 properties and lag=30).  
    """
    #validate input protein sequence
    sequence = _validate_sequence(sequence)
            
    #validate lag, set default lag if invalid value input
    if (lag>=len(sequence) or (lag<0) or not (isinstance(lag, int))):
        lag = 30

    #validate at least 1 property input to function
    if (properties == "" or properties == []):
        raise ValueError('At least one property must be passed into function.')

    #if 1 property passed into function, wrap it to an iterable list
    if (isinstance(properties, str)):
        properties = [properties]
        
    #initialise dicts to store AAI properties and values
    aai_properties = {}
    for prop in properties:
        #raise value error if accession number not found in aaindex1
        if not (prop in aaindex1.record_codes()):
            raise ValueError(f"Property {prop} not found in list of available properties in the aaindex1:\n{properties}.")
        aai_properties[prop] = {}

    #iterate through list of properties, getting property values for accession numbers from AAIndex1
    for prop in list(aai_properties.keys()):
        #get property values from AAIndex & reshape
        aaindex1[prop].values.pop('-', None)
        prop_aminoacid_values = aaindex1[prop].values

        aai_property_vals = {}

        #normalise property values, calculate mean and std dev
        if (normalize):
            for i, j in prop_aminoacid_values.items():
                aai_property_vals[i] = (j - (sum(prop_aminoacid_values.values()) / 
                    len(prop_aminoacid_values.values()))) / _std(prop_aminoacid_values.values(), ddof=0)
        else:
            #use raw (unnormalized) property values
            for i, j in prop_aminoacid_values.items():
                aai_property_vals[i] = j
        aa_counter = 0

        #assign property and associated amino acid values to aai_property_vals array
        for i, j in aai_property_vals.items():
            aai_property_vals[amino_acids[aa_counter]] = aai_property_vals[i]
            aa_counter+=1
        aai_properties[prop] = aai_property_vals

    #store resultant autocorrelation values
    geary_autocorrelation = {}

    #iterate through list of properties, calculating autocorrelation values for the sequence, append to results dict
    for key in aai_properties:
        cc = []
        for aa in sequence:
            cc.append(aai_properties[key][aa])
        k = ((np.std(cc, ddof=0)) ** 2) * len(sequence) / (len(sequence) - 1)
        for i in range(1, lag+1):
            temp = 0
            for j in range(len(sequence) - i):
                temp = (temp + (
                    aai_properties[key][sequence[j]] - aai_properties[key][sequence[j+i]]) ** 2)
            if len(sequence) - i == 0:
                geary_autocorrelation["GAuto_" + key + "_" + str(i)] = round(
                    temp / (2* (len(sequence))) / k, 3)
            else:
                geary_autocorrelation["GAuto_" + key + "_" + str(i)] = round(
                    temp / (2* (len(sequence) -i)) / k, 3)

    #transform values and columns to DataFrame
    geary_autocorrelation_df = pd.DataFrame([list(geary_autocorrelation.values())], 
        columns=list(geary_autocorrelation.keys()))

    return geary_autocorrelation_df


def _std(array: list[float], ddof: int = 1) -> float:
    """
    Calculate the standard deviation of the array data, with associated means Delta Degrees 
    of Freedom (ddof).

    Parameters
    ==========
    :array: np.array
        numpy array of float values.
    :ddof: int (default=1)
        means delta degrees of freedom.

    Returns
    =======
    :result: np.array
        input array after standard deviation transformation.
    """
    return math.sqrt(sum([math.pow(i - sum(array) / len(array), 2) for i in array]) 
        / (len(array) - ddof))