################################################################################
################# CTD #################
################################################################################
from __future__ import annotations
import pandas as pd
import math
import copy
from difflib import get_close_matches
#list of amino acids
from ._constants import amino_acids, _validate_sequence
"""
References
==========
[1] Inna Dubchak, Ilya Muchink, Stephen R.Holbrook and Sung-Hou Kim.
Prediction of protein folding class using global description of amino
acid sequence. Proc.Natl. Acad.Sci.USA, 1995, 92, 8700-8704.
[2] Inna Dubchak, Ilya Muchink, Christopher Mayor, Igor Dralyuk and Sung-Hou
Kim. Recognition of a Protein Fold in the Context of the SCOP
classification. Proteins: Structure, Function and Genetics, 1999, 35, 401-407.
[3] Gromiha, M. M. (2010). Protein Sequence Analysis. In M. M. Gromiha (Ed.),
Protein Bioinformatics (pp. 29–62). Elsevier.
"""
hydrophobicity = {"name": "hydrophobicity", "1": "RKEDQN", "2": "GASTPHY", "3": "CLVIMFW"}
# '1' -> Polar; '2' -> Neutral, '3' -> Hydrophobicity
normalized_vdwv = {"name": "normalized_vdwv", "1": "GASTPD", "2": "NVEQIL", "3": "MHKFRYW"}
# '1' -> (0-2.78); '2' -> (2.95-4.0), '3' -> (4.03-8.08)
polarity = {"name": "polarity", "1": "LIFWCMVY", "2": "CPNVEQIL", "3": "KMHFRYW"}
# '1' -> (4.9-6.2); '2' -> (8.0-9.2), '3' -> (10.4-13.0)
charge = {"name": "charge", "1": "KR", "2": "ANCQGHILMFPSTWYV", "3": "DE"}
# '1' -> Positive; '2' -> Neutral, '3' -> Negative
secondary_struct = {"name": "secondary_struct", "1": "EALMQKRH", "2": "VIYCWFT", "3": "GNPSD"}
# '1' -> Helix; '2' -> Strand, '3' -> coil
solvent_accessibility = {"name": "solvent_accessibility", "1": "ALFCGIVW", "2": "RKQEND", "3": "MPSTHY"}
# '1' -> Buried; '2' -> Exposed, '3' -> Intermediate
polarizability = {"name": "polarizability", "1": "GASDT", "2": "CPNVEQIL", "3": "KMHFRYW"}
# '1' -> (0-0.108); '2' -> (0.128-0.186), '3' -> (0.219-0.409)
#object of physicochemical properties to use for calculating CTD descriptors
ctd_properties = {
"hydrophobicity": hydrophobicity,
"normalized_vdwv": normalized_vdwv,
"polarity": polarity,
"charge": charge,
"secondary_struct": secondary_struct,
"solvent_accessibility": solvent_accessibility,
"polarizability": polarizability
}
[docs]
def str_to_num(sequence: str, property: dict[str, str]) -> str:
"""
Convert sequences str to number from input physicochemical property.
Parameters
==========
:sequence: str
protein sequence.
:property: str
physicochemical property name to use when calculating descriptor.
Returns
=======
:sequence_converted: str
converted protein sequence into numerical format.
"""
#validate input protein sequence
sequence = _validate_sequence(sequence)
#create deep copy of sequence
sequence_converted = copy.deepcopy(sequence)
#convert amino acid into corresponding CTD keys
for key, value in list(property.items()):
if (key == "name"):
continue
for index in value:
sequence_converted = sequence_converted.replace(index, key)
return sequence_converted
##################################### CTD Composition #####################################
[docs]
def ctd_composition(sequence: str, property: str = "hydrophobicity") -> pd.DataFrame:
"""
Calculate composition physicochemical/structural descriptor. Composition
is determined as the number of amino acids of a particular property
divided by the total number of amino acids. The shape of the output will be
1 x 3, with 3 features being generated per sequence.
Parameters
==========
:sequence: str
protein sequence.
:property: str (default="hydrophocity")
physicochemical property name to use when calculating descriptor.
Returns
=======
:ctd_composition_df: pd.DataFrame
dataframe of calculated composition values for sequence using
selected physicochemical property. Output will be of shape 1 x 3,
with 3 features being generated per sequence.
"""
#validate input protein sequence
sequence = _validate_sequence(sequence)
#get closest matched CTD property - using cutoff of 0.8
property_matches = get_close_matches(property, ctd_properties.keys(), cutoff=0.8)
#raise error if no matching property found
if (property_matches != []):
prop = ctd_properties[property_matches[0]]
else:
raise ValueError(f"Invalid property '{property}'. Available properties are: {list(ctd_properties.keys())}.")
#convert sequence to number
seq = str_to_num(sequence, prop)
ctd_composition_ = {}
#calculate descriptor values, append to ctd_composition_ dict
ctd_composition_['CTD_C_01_' + prop["name"]] = round(float(seq.count("1"))/len(sequence), 3)
ctd_composition_['CTD_C_02_' + prop["name"]] = round(float(seq.count("2"))/len(sequence), 3)
ctd_composition_['CTD_C_03_' + prop["name"]] = round(float(seq.count("3"))/len(sequence), 3)
#transform values and columns to DataFrame
ctd_composition_df = pd.DataFrame([list(ctd_composition_.values())], columns=list(ctd_composition_.keys()))
return ctd_composition_df
##################################### CTD Transition ######################################
[docs]
def ctd_transition(sequence: str, property: str = "hydrophobicity") -> pd.DataFrame:
"""
Calculate transition physicochemical/structural descriptor. Transition
is determined as the number of transitions from a particular property
to different property divided by (total number of amino acids − 1).
The shape of the output will be 1 x 3, with 3 features being generated
per sequence.
Parameters
==========
:sequence: str
protein sequence.
:property: str (default="hydrophocity")
physicochemical property name to use when calculating descriptor.
Returns
=======
:ctd_transition_df: pd.DataFrame
dataframe of calculated transition values for sequence using
selected physicochemical property. Output will be of shape 1 x 3,
with 3 features being generated per sequence.
"""
#validate input protein sequence
sequence = _validate_sequence(sequence)
#get closest matched CTD property - using cutoff of 0.8
property_matches = get_close_matches(property, ctd_properties.keys(), cutoff=0.8)
#raise error if no matching property found
if (property_matches != []):
prop = ctd_properties[property_matches[0]]
else:
raise ValueError(f"Invalid property '{property}'. Available properties are: {list(ctd_properties.keys())}.")
#convert sequence to number
seq = str_to_num(sequence, prop)
ctd_transition_ = {}
#calculate descriptor values, append to ctd_transition_ dict
ctd_transition_["CTD_T_12_" + prop["name"]] = round(
float(seq.count("12") + seq.count("21")) / (len(sequence)-1), 3)
ctd_transition_["CTD_T_13_" + prop["name"]] = round(
float(seq.count("13") + seq.count("31")) / (len(sequence)-1), 3)
ctd_transition_["CTD_T_23_" + prop["name"]] = round(
float(seq.count("23") + seq.count("32")) / (len(sequence)-1), 3)
#transform values and columns to DataFrame
ctd_transition_df = pd.DataFrame([list(ctd_transition_.values())], columns=list(ctd_transition_.keys()))
return ctd_transition_df
#################################### CTD Distribution #####################################
[docs]
def ctd_distribution(sequence: str, property: str = "hydrophobicity") -> pd.DataFrame:
"""
Calculate distribution physicochemical/structural descriptor. Distribution
is the chain length within which the first, 25%, 50%, 75% and 100% of the
amino acids of a particular property are located. The shape of the output
will be 1 x 15, with 15 features being generated per sequence.
Parameters
==========
:sequence: str
protein sequence.
:property: str (default="hydrophocity")
physicochemical property name to use when calculating descriptor.
Returns
=======
:ctd_distribution_df: pd.DataFrame
dataframe of calculated distribution values for sequence using
selected physicochemical property. Output will be of shape 1 x 15,
with 15 features being generated per sequence.
"""
#validate input protein sequence
sequence = _validate_sequence(sequence)
#get closest matched CTD property - using cutoff of 0.8
property_matches = get_close_matches(property, ctd_properties.keys(), cutoff=0.8)
#raise error if no matching property found
if (property_matches != []):
prop = ctd_properties[property_matches[0]]
else:
raise ValueError(f"Invalid property '{property}'. Available properties are: {list(ctd_properties.keys())}.")
#convert sequence to number
seq = str_to_num(sequence, prop)
ctd_distribution_ = {}
#iterate through sequence, calculating distribution descriptor values using property
for key, value in prop.items():
if (key=="name"):
continue
num = seq.count(key)
ink = 1
indexk = 0
cds = []
while ink <= num:
indexk = seq.find(key,indexk) + 1
cds.append(indexk)
ink = ink + 1
if cds == []:
ctd_distribution_["CTD_D_0" + key + "_001_" + prop["name"]] = 0
ctd_distribution_["CTD_D_0" + key + "_025_" + prop["name"]] = 0
ctd_distribution_["CTD_D_0" + key + "_050_" + prop["name"]] = 0
ctd_distribution_["CTD_D_0" + key + "_075_" + prop["name"]] = 0
ctd_distribution_["CTD_D_0" + key + "_100_" + prop["name"]] = 0
else:
ctd_distribution_["CTD_D_0" + key + "_001_" + prop["name"]] = round(
float(cds[0]) / len(seq) * 100, 3
)
ctd_distribution_["CTD_D_0" + key + "_025_" + prop["name"]] = round(
float(cds[int(math.floor(num * 0.25)) - 1]) / len(seq) * 100, 3
)
ctd_distribution_["CTD_D_0" + key + "_050_" + prop["name"]] = round(
float(cds[int(math.floor(num * 0.5)) - 1]) / len(seq) * 100, 3
)
ctd_distribution_["CTD_D_0" + key + "_075_" + prop["name"]] = round(
float(cds[int(math.floor(num * 0.75)) - 1]) / len(seq) * 100, 3
)
ctd_distribution_["CTD_D_0" + key + "_100_" + prop["name"]] = round(
float(cds[-1]) / len(seq) * 100, 3)
#transform values and columns to DataFrame
ctd_distribution_df = pd.DataFrame([list(ctd_distribution_.values())], columns=list(ctd_distribution_.keys()))
return ctd_distribution_df
[docs]
def ctd_(sequence: str, property: str = "hydrophobicity", all_ctd: bool = True) -> pd.DataFrame:
"""
Calculate all Composition, Transition and Distribution (CTD) features of protein sequences.
Composition is the number of amino acids of a particular property (e.g., hydrophobicity)
divided by the total number of amino acids in a protein sequence. Transition
characterizes the percent frequency with which amino acids of a particular
property is followed by amino acids of a different property. Distribution
measures the chain length within which the first, 25%, 50%, 75%, and 100% of
the amino acids of a particular property are located, respectively [1, 2].
CTD properties available are: Polarizability, Solvent Accessibility, Secondary
Structure, Charge, Polarity, Normalized VDWV, Hydrophobicity. Each property generates an
output of shape 1 x 21, 3/21 will be Composition, 3/21 will be Transition, 15/21 will be
Distribution. When Calculating all available features the generated output will be of shape
1 x 147, 21/147 will be composition, 21/147 will be transition and the remaining 105 are
distribution.
Parameters
==========
:sequence: str
protein sequence.
:property: str (default="hydrophocity")
physicochemical property name to use when calculating descriptor.
:ctd: bool
calculate all CTD descriptors and concatenate together.
Returns
=======
:ctd_df: pd.DataFrame
dataframe of CTD descriptor values for all protein sequences. DataFrame will
be of the shape 1 x 147, where 147 is the total number of features calculated from
the CTD descriptors per sequence, with each property generating an output of 1 x 21.
"""
#validate input protein sequence
sequence = _validate_sequence(sequence)
#initialise ctd dataframes
comp_df = pd.DataFrame()
trans_df = pd.DataFrame()
distr_df = pd.DataFrame()
#if using single property, calculate each of the CTD descriptors individually
if not all_ctd:
comp_df = ctd_composition(sequence, property=property)
trans_df = ctd_transition(sequence, property=property)
distr_df = ctd_distribution(sequence, property=property)
else:
#if using all calculable properties, calculate CTD descriptors for each property
for prop in ctd_properties:
comp = ctd_composition(sequence, property=prop)
comp_df = pd.concat([comp_df, comp], axis=1)
trans = ctd_transition(sequence, property=prop)
trans_df = pd.concat([trans_df, trans], axis=1)
distr = ctd_distribution(sequence, property=prop)
distr_df = pd.concat([distr_df, distr], axis=1)
#concatenate all descriptors
ctd = pd.concat([comp_df, trans_df, distr_df], axis=1)
return ctd