Source code for protpy.ctd

################################################################################
#################                    CTD                       #################
################################################################################

from __future__ import annotations

import pandas as pd
import math
import copy
from difflib import get_close_matches

#list of amino acids
from ._constants import amino_acids, _validate_sequence

"""
References
==========
[1] Inna Dubchak, Ilya Muchink, Stephen R.Holbrook and Sung-Hou Kim.
    Prediction of protein folding class using global description of amino
    acid sequence. Proc.Natl. Acad.Sci.USA, 1995, 92, 8700-8704.
[2] Inna Dubchak, Ilya Muchink, Christopher Mayor, Igor Dralyuk and Sung-Hou
    Kim. Recognition of a Protein Fold in the Context of the SCOP
    classification. Proteins: Structure, Function and Genetics, 1999, 35, 401-407.
[3] Gromiha, M. M. (2010). Protein Sequence Analysis. In M. M. Gromiha (Ed.), 
    Protein Bioinformatics (pp. 29–62). Elsevier.
"""

hydrophobicity = {"name": "hydrophobicity", "1": "RKEDQN", "2": "GASTPHY", "3": "CLVIMFW"}
# '1' -> Polar; '2' -> Neutral, '3' -> Hydrophobicity

normalized_vdwv = {"name": "normalized_vdwv", "1": "GASTPD", "2": "NVEQIL", "3": "MHKFRYW"}
# '1' -> (0-2.78); '2' -> (2.95-4.0), '3' -> (4.03-8.08)

polarity = {"name": "polarity", "1": "LIFWCMVY", "2": "CPNVEQIL", "3": "KMHFRYW"}
# '1' -> (4.9-6.2); '2' -> (8.0-9.2), '3' -> (10.4-13.0)

charge = {"name": "charge", "1": "KR", "2": "ANCQGHILMFPSTWYV", "3": "DE"}
# '1' -> Positive; '2' -> Neutral, '3' -> Negative

secondary_struct = {"name": "secondary_struct", "1": "EALMQKRH", "2": "VIYCWFT", "3": "GNPSD"}
# '1' -> Helix; '2' -> Strand, '3' -> coil

solvent_accessibility = {"name": "solvent_accessibility", "1": "ALFCGIVW", "2": "RKQEND", "3": "MPSTHY"}
# '1' -> Buried; '2' -> Exposed, '3' -> Intermediate

polarizability = {"name": "polarizability", "1": "GASDT", "2": "CPNVEQIL", "3": "KMHFRYW"}
# '1' -> (0-0.108); '2' -> (0.128-0.186), '3' -> (0.219-0.409)

#object of physicochemical properties to use for calculating CTD descriptors
ctd_properties = {
    "hydrophobicity": hydrophobicity,
    "normalized_vdwv": normalized_vdwv,
    "polarity": polarity,
    "charge": charge,
    "secondary_struct": secondary_struct,
    "solvent_accessibility": solvent_accessibility,
    "polarizability": polarizability
    }


[docs]
def str_to_num(sequence: str, property: dict[str, str]) -> str:
    """
    Convert sequences str to number from input physicochemical property.

    Parameters
    ==========
    :sequence: str
        protein sequence.
    :property: str
        physicochemical property name to use when calculating descriptor.

    Returns
    =======
    :sequence_converted: str
        converted protein sequence into numerical format. 
    """
    #validate input protein sequence
    sequence = _validate_sequence(sequence)

    #create deep copy of sequence
    sequence_converted = copy.deepcopy(sequence)

    #convert amino acid into corresponding CTD keys
    for key, value in list(property.items()):
        if (key == "name"):
            continue
        for index in value:
            sequence_converted = sequence_converted.replace(index, key)

    return sequence_converted


##################################### CTD Composition #####################################


[docs]
def ctd_composition(sequence: str, property: str = "hydrophobicity") -> pd.DataFrame:
    """
    Calculate composition physicochemical/structural descriptor. Composition 
    is determined as the number of amino acids of a particular property 
    divided by the total number of amino acids. The shape of the output will be 
    1 x 3, with 3 features being generated per sequence.

    Parameters
    ==========
    :sequence: str
        protein sequence.
    :property: str (default="hydrophocity")
        physicochemical property name to use when calculating descriptor.

    Returns
    =======
    :ctd_composition_df: pd.DataFrame
        dataframe of calculated composition values for sequence using
        selected physicochemical property. Output will be of shape 1 x 3,
        with 3 features being generated per sequence.
    """
    #validate input protein sequence
    sequence = _validate_sequence(sequence)

    #get closest matched CTD property - using cutoff of 0.8
    property_matches = get_close_matches(property, ctd_properties.keys(), cutoff=0.8)

    #raise error if no matching property found
    if (property_matches != []):
        prop = ctd_properties[property_matches[0]]
    else:
        raise ValueError(f"Invalid property '{property}'. Available properties are: {list(ctd_properties.keys())}.")
    
    #convert sequence to number
    seq = str_to_num(sequence, prop)

    ctd_composition_ = {}
    
    #calculate descriptor values, append to ctd_composition_ dict
    ctd_composition_['CTD_C_01_' + prop["name"]] = round(float(seq.count("1"))/len(sequence), 3) 
    ctd_composition_['CTD_C_02_' + prop["name"]] = round(float(seq.count("2"))/len(sequence), 3)
    ctd_composition_['CTD_C_03_' + prop["name"]] = round(float(seq.count("3"))/len(sequence), 3)

    #transform values and columns to DataFrame
    ctd_composition_df = pd.DataFrame([list(ctd_composition_.values())], columns=list(ctd_composition_.keys()))

    return ctd_composition_df


##################################### CTD Transition ######################################


[docs]
def ctd_transition(sequence: str, property: str = "hydrophobicity") -> pd.DataFrame:
    """
    Calculate transition physicochemical/structural descriptor. Transition 
    is determined as the number of transitions from a particular property 
    to different property divided by (total number of amino acids − 1). 
    The shape of the output will be 1 x 3, with 3 features being generated 
    per sequence.

    Parameters
    ==========
    :sequence: str
        protein sequence.
    :property: str (default="hydrophocity")
        physicochemical property name to use when calculating descriptor.

    Returns
    =======
    :ctd_transition_df: pd.DataFrame
        dataframe of calculated transition values for sequence using
        selected physicochemical property. Output will be of shape 1 x 3,
        with 3 features being generated per sequence.
    """
    #validate input protein sequence
    sequence = _validate_sequence(sequence)

    #get closest matched CTD property  - using cutoff of 0.8
    property_matches = get_close_matches(property, ctd_properties.keys(), cutoff=0.8)

    #raise error if no matching property found
    if (property_matches != []):
        prop = ctd_properties[property_matches[0]]
    else:
        raise ValueError(f"Invalid property '{property}'. Available properties are: {list(ctd_properties.keys())}.")

    #convert sequence to number
    seq = str_to_num(sequence, prop)
    
    ctd_transition_ = {}

    #calculate descriptor values, append to ctd_transition_ dict
    ctd_transition_["CTD_T_12_" + prop["name"]] = round(
        float(seq.count("12") + seq.count("21")) / (len(sequence)-1), 3)
    ctd_transition_["CTD_T_13_" + prop["name"]] = round(
        float(seq.count("13") + seq.count("31")) / (len(sequence)-1), 3)
    ctd_transition_["CTD_T_23_" + prop["name"]] = round(
        float(seq.count("23") + seq.count("32")) / (len(sequence)-1), 3)

    #transform values and columns to DataFrame
    ctd_transition_df = pd.DataFrame([list(ctd_transition_.values())], columns=list(ctd_transition_.keys()))

    return ctd_transition_df


#################################### CTD Distribution #####################################


[docs]
def ctd_distribution(sequence: str, property: str = "hydrophobicity") -> pd.DataFrame:
    """
    Calculate distribution physicochemical/structural descriptor. Distribution 
    is the chain length within which the first, 25%, 50%, 75% and 100% of the 
    amino acids of a particular property are located. The shape of the output 
    will be 1 x 15, with 15 features being generated per sequence.

    Parameters
    ==========
    :sequence: str
        protein sequence.
    :property: str (default="hydrophocity")
        physicochemical property name to use when calculating descriptor.

    Returns
    =======
    :ctd_distribution_df: pd.DataFrame
        dataframe of calculated distribution values for sequence using
        selected physicochemical property. Output will be of shape 1 x 15,
        with 15 features being generated per sequence.
    """
    #validate input protein sequence
    sequence = _validate_sequence(sequence)

    #get closest matched CTD property - using cutoff of 0.8
    property_matches = get_close_matches(property, ctd_properties.keys(), cutoff=0.8)

    #raise error if no matching property found
    if (property_matches != []):
        prop = ctd_properties[property_matches[0]]
    else:
        raise ValueError(f"Invalid property '{property}'. Available properties are: {list(ctd_properties.keys())}.")

    #convert sequence to number
    seq = str_to_num(sequence, prop)
    
    ctd_distribution_ = {}
    
    #iterate through sequence, calculating distribution descriptor values using property
    for key, value in prop.items():
       if (key=="name"):
        continue
       num = seq.count(key)
       ink = 1
       indexk = 0
       cds = []
       while ink <= num:
           indexk = seq.find(key,indexk) + 1
           cds.append(indexk)
           ink = ink + 1
       if cds == []:
           ctd_distribution_["CTD_D_0" + key + "_001_" + prop["name"]] = 0
           ctd_distribution_["CTD_D_0" + key + "_025_" + prop["name"]] = 0
           ctd_distribution_["CTD_D_0" + key + "_050_" + prop["name"]] = 0
           ctd_distribution_["CTD_D_0" + key + "_075_" + prop["name"]] = 0
           ctd_distribution_["CTD_D_0" + key + "_100_" + prop["name"]] = 0
       else:
           ctd_distribution_["CTD_D_0" + key + "_001_" + prop["name"]] = round(
            float(cds[0]) / len(seq) * 100, 3
           )
           ctd_distribution_["CTD_D_0" + key + "_025_" + prop["name"]] = round(
            float(cds[int(math.floor(num * 0.25)) - 1]) / len(seq) * 100, 3
           )
           ctd_distribution_["CTD_D_0" + key + "_050_" + prop["name"]] = round(
            float(cds[int(math.floor(num * 0.5)) - 1]) / len(seq) * 100, 3
           )
           ctd_distribution_["CTD_D_0" + key + "_075_" + prop["name"]] = round(
            float(cds[int(math.floor(num * 0.75)) - 1]) / len(seq) * 100, 3
           )
           ctd_distribution_["CTD_D_0" + key + "_100_" + prop["name"]] = round(
            float(cds[-1]) / len(seq) * 100, 3)

    #transform values and columns to DataFrame
    ctd_distribution_df = pd.DataFrame([list(ctd_distribution_.values())], columns=list(ctd_distribution_.keys()))

    return ctd_distribution_df



[docs]
def ctd_(sequence: str, property: str = "hydrophobicity", all_ctd: bool = True) -> pd.DataFrame:
    """
    Calculate all Composition, Transition and Distribution (CTD) features of protein sequences.
    Composition is the number of amino acids of a particular property (e.g., hydrophobicity)
    divided by the total number of amino acids in a protein sequence. Transition
    characterizes the percent frequency with which amino acids of a particular
    property is followed by amino acids of a different property. Distribution
    measures the chain length within which the first, 25%, 50%, 75%, and 100% of
    the amino acids of a particular property are located, respectively [1, 2].

    CTD properties available are: Polarizability, Solvent Accessibility, Secondary 
    Structure, Charge, Polarity, Normalized VDWV, Hydrophobicity. Each property generates an 
    output of shape 1 x 21, 3/21 will be Composition, 3/21 will be Transition, 15/21 will be
    Distribution. When Calculating all available features the generated output will be of shape 
    1 x 147, 21/147 will be composition, 21/147 will be transition and the remaining 105 are 
    distribution.
    
    Parameters
    ==========
    :sequence: str
        protein sequence.
    :property: str (default="hydrophocity")
        physicochemical property name to use when calculating descriptor.
    :ctd: bool
        calculate all CTD descriptors and concatenate together.

    Returns
    =======
    :ctd_df: pd.DataFrame
        dataframe of CTD descriptor values for all protein sequences. DataFrame will
        be of the shape 1 x 147, where 147 is the total number of features calculated from
        the CTD descriptors per sequence, with each property generating an output of 1 x 21.
    """
    #validate input protein sequence
    sequence = _validate_sequence(sequence)
    
    #initialise ctd dataframes
    comp_df = pd.DataFrame()
    trans_df = pd.DataFrame()
    distr_df = pd.DataFrame()

    #if using single property, calculate each of the CTD descriptors individually
    if not all_ctd:
        comp_df = ctd_composition(sequence, property=property)
        trans_df = ctd_transition(sequence, property=property)
        distr_df = ctd_distribution(sequence, property=property)
    else:
        #if using all calculable properties, calculate CTD descriptors for each property
        for prop in ctd_properties:
            comp = ctd_composition(sequence, property=prop)
            comp_df = pd.concat([comp_df, comp], axis=1)
            trans = ctd_transition(sequence, property=prop)
            trans_df = pd.concat([trans_df, trans], axis=1)
            distr = ctd_distribution(sequence, property=prop)
            distr_df = pd.concat([distr_df, distr], axis=1)

    #concatenate all descriptors
    ctd = pd.concat([comp_df, trans_df, distr_df], axis=1)
    
    return ctd