Source code for protpy.composition

################################################################################
#############                   Composition                       ##############
################################################################################

from __future__ import annotations

import re
import itertools
import pandas as pd
import math
from aaindex import aaindex1

"""
References
==========
[1] Gromiha, M. M. (2010). Protein Sequence Analysis. In M. M. Gromiha (Ed.), 
    Protein Bioinformatics (pp. 29–62). Elsevier.
[2] Hua, S. and Sun, Z. (2001) Support vector machine approach for protein
    subcellular localization prediction. Bioinformatics, 17, 721-728.
[3] Grassmann, J., Reczko, M., Suhai, S. and Edler, L. (1999) Protein fold
    class prediction: new methods of statistical classification. Proc Int Conf
    Intell Syst Mol Biol, 106-112.
[4] Kuo-Chen Chou. Prediction of Protein Cellular Attributes Using Pseudo-Amino Acid
    Composition. PROTEINS: Structure, Function, and Genetics, 2001, 43: 246-255.
[5] Kuo-Chen Chou, Using amphiphilic pseudo amino acid composition to predict enzyme 
    subfamily classes, Bioinformatics, Volume 21, Issue 1, January 2005, Pages 10–19, 
    https://doi.org/10.1093/bioinformatics/bth466
[6] http://www.csbio.sjtu.edu.cn/bioinf/PseAAC/ParaValue.htm.
[7] Reczko, M. and Bohr, H. (1994) The DEF data base of sequence based protein
    fold class predictions. Nucleic Acids Res, 22, 3616-3619.
[8] Kyte, J., & Doolittle, R. F. (1982). A simple method for displaying the
    hydropathic character of a protein. Journal of Molecular Biology, 157(1),
    105-132. https://doi.org/10.1016/0022-2836(82)90515-0
[9] Lobry, J. R., & Gautier, C. (1994). Hydrophobicity, expressivity and
    aromaticity are the major trends of amino-acid usage in 999 Escherichia coli
    chromosome-encoded genes. Nucleic Acids Research, 22(15), 3174-3180.
[10] Guruprasad, K., Reddy, B. V. B., & Pandit, M. W. (1990). Correlation
    between stability of a protein and its dipeptide composition: a novel
    approach for predicting in vivo stability of a protein from its primary
    sequence. Protein Engineering, 4(2), 155-161.
[11] Bjellqvist, B., et al. (1993). The focusing positions of polypeptides in
    immobilized pH gradients can be predicted from their amino acid sequences.
    Electrophoresis, 14(1), 1023-1031.
[12] Gasteiger, E., et al. (2005). Protein Identification and Analysis Tools on
    the ExPASy Server. In J. M. Walker (Ed.), The Proteomics Protocols Handbook
    (pp. 571-607). Humana Press.
[13] Cameselle-Teijeiro, J. (1979). The Henderson-Hasselbalch equation. 
    Biochemical Education, 7(3), 69-70.
[14] Chou, P. Y., & Fasman, G. D. (1974). Conformational parameters for amino
    acids in helical, beta-sheet, and random coil regions calculated from
    proteins. Biochemistry, 13(2), 211-222.
[15] Prosite database of protein families and domains:
    https://prosite.expasy.org/
[16] Ikai, A. J. (1980). Thermostability and aliphatic index of globular
    proteins. Journal of Biochemistry, 88(6), 1895-1898.
[17] Gasteiger, E., et al. (2005). Protein Identification and Analysis Tools
    on the ExPASy Server. In J. M. Walker (Ed.), The Proteomics Protocols
    Handbook (pp. 571-607). Humana Press. (extinction coefficient method)
[18] Boman, H. G. (2003). Antibacterial peptides: basic facts and emerging
    concepts. Journal of Internal Medicine, 254(3), 197-215.
[19] Eisenberg, D., Weiss, R. M., & Terwilliger, T. C. (1982). The helical
    hydrophobic moment: a measure of the amphiphilicity of a helix. Nature,
    299, 371-374. https://doi.org/10.1038/299371a0
"""
#list of amino acids
from ._constants import amino_acids, _validate_sequence

#physicochemical values for all amino acids for hydrophobicity, hydrophilicty and residue mass
#values taken from http://www.csbio.sjtu.edu.cn/bioinf/PseAAC/ParaValue.htm
#property values differ from their equivalent in the aaindex therefore hard-coding them below
hydrophobicity_ = {
        "A": 0.62, "C": 0.29, "D": -0.90, "E": -0.74, "F": 1.19, "G": 0.48, 
        "H": -0.40, "I": 1.38, "K": -1.50, "L": 1.06, "M": 0.64, "N": -0.78,
        "P": 0.12, "Q": -0.85, "R": -2.53, "S": -0.18, "T": -0.05, "V": 1.08, 
        "W": 0.81, "Y": 0.26
        }

hydrophilicity_ = {
        "A": -0.5, "C": -1.0, "D": 3.0, "E": 3.0, "F": -2.5,  "G": 0.0, 
        "H": -0.5, "I": -1.8, "K": 3.0, "L": -1.8, "M": -1.3, "N": 0.2, 
        "P": 0.0, "Q": 0.2, "R": 3.0, "S": 0.3, "T": -0.4, "V": -1.5, 
        "W": -3.4, "Y": -2.3
        }

residue_mass_ = {
        "A": 15.0, "C": 47.0, "D": 59.0, "E": 73.0, "F": 91.0, "G": 1.000, 
        "H": 82.0, "I": 57.0, "K": 73.0, "L": 57.0,  "M": 75.0, "N": 58.0, 
        "P": 42.0, "Q": 72.0, "R": 101.0, "S": 31.0, "T": 45.0, "V": 43.0, 
        "W": 130.0, "Y": 107.0
        }

################################# Amino Acid Composition ##################################

[docs] def amino_acid_composition(sequence: str) -> pd.DataFrame: """ Calculate Amino Acid Composition (AAComp) of protein sequence. AAComp describes the fraction of each amino acid type within a protein sequence, and is calculated as: AA_Comp(s) = AA(t)/N(s) where AA_Comp(s) is the AAComp of protein sequence s, AA(t) is the number of amino acid types t (where t = 1,2,..,20) and N(s) is the length of the sequences [1]. Parameters ========== :sequence: str protein sequence. Returns ======= :amino_acid_composition_df: pd.DataFrame pandas dataframe of AAComp for protein sequence. Dataframe will be of the shape 1 x 20, where 20 is the number of features calculated from the descriptor (for the 20 amino acids) and 1 is the input sequence. """ #validate input protein sequence sequence = _validate_sequence(sequence) amino_acid_composition = {} #iterate through each amino acid, calculating each composition for aa in amino_acids: amino_acid_composition[aa] = round(float(sequence.count(aa)) / len(sequence) * 100, 3) #transform values and columns to DataFrame amino_acid_composition_df = pd.DataFrame([list(amino_acid_composition.values())], columns=list(amino_acid_composition.keys())) return amino_acid_composition_df
################################## Dipeptide Composition ##################################
[docs] def dipeptide_composition(sequence: str) -> pd.DataFrame: """ Calculate Dipeptide Composition (DPComp) for protein sequence. Dipeptide composition is the fraction of each dipeptide type within a protein sequence. With dipeptides being of length 2 and there being 20 canonical amino acids this creates 20^2 different combinations, thus a 400-Dimensional vector will be produced such that: DPComp(s,t) = AA(s,t) / N -1 where DPComp(s,t) is the dipeptide composition of the protein sequence for amino acid type s and t (where s and t = 1,2,..,20), AA(s,t) is the number of dipeptides represented by amino acid type s and t and N is the total number of dipeptides [1]. Parameters ========== :sequence: str protein sequence. Returns ======= :dipeptide_composition_df: pd.DataFrame pandas dataframe of dipeptide composition for protein sequence. Dataframe will be of the shape 1 x 400, where 400 is the number of features calculated from the descriptor (20^2 for the 20 canonical amino acids) and 1 is the input sequence. """ #validate input protein sequence sequence = _validate_sequence(sequence) dipeptide_composition = {} #iterate through each amino acid, calculating each dipeptide combination's composition for i in amino_acids: for j in amino_acids: dipep = i + j dipeptide_composition[dipep] = round( float(sequence.count(dipep)) / (len(sequence)-1) * 100, 2) #transform values and columns to DataFrame dipeptide_composition_df = pd.DataFrame([list(dipeptide_composition.values())], columns=list(dipeptide_composition.keys())) return dipeptide_composition_df
################################## Tripeptide Composition #################################
[docs] def tripeptide_composition(sequence: str) -> pd.DataFrame: """ Calculate Tripeptide Composition (TPComp) of protein sequence. Tripeptide composition is the fraction of each tripeptide type within a protein sequence. With tripeptides being of length 3 and there being 20 canonical amino acids this creates 20^3 different combinations, thus a 8000-Dimensional vector will be produced such that: TPComp(s,t,u) = AA(s,t,u) / N - 1 where TPComp(s,t,u) is the tripeptide composition of the protein sequence for amino acid type s, t and u (where s, t and u = 1,2,..,20), AA(s,t,u) is the number of tripeptides represented by amino acid type s and t, and N is the total number of tripeptides [1]. Parameters ========== :sequence: str protein sequence in str form. Returns ======= :tripeptide_composition_df: pd.DataFrame pandas DataFrame of tripeptide composition for protein sequence. Dataframe will be of the shape 1 x 8000, where 8000 is the number of features calculated from the descriptor (20^3 for the 20 canonical amino acids) and 1 is the input sequence. """ #validate input protein sequence sequence = _validate_sequence(sequence) tripeptide_composition = {} tripeptides = [] #get list of tripeptides for i in amino_acids: for j in amino_acids: for k in amino_acids: tripeptides.append(i + j + k) #get frequency of each tripeptide in the sequence for i in tripeptides: tripeptide_composition[i] = len(re.findall(i, sequence)) #transform values and columns to DataFrame tripeptide_composition_df = pd.DataFrame([list(tripeptide_composition.values())], columns=list(tripeptide_composition.keys())) return tripeptide_composition_df
################################## GRAVY ################################################## #Kyte-Doolittle hydropathy scale for all 20 standard amino acids [8] kyte_doolittle_ = { "A": 1.8, "C": 2.5, "D": -3.5, "E": -3.5, "F": 2.8, "G": -0.4, "H": -3.2, "I": 4.5, "K": -3.9, "L": 3.8, "M": 1.9, "N": -3.5, "P": -1.6, "Q": -3.5, "R": -4.5, "S": -0.8, "T": -0.7, "V": 4.2, "W": -0.9, "Y": -1.3 }
[docs] def gravy(sequence: str) -> pd.DataFrame: """ Calculate the Grand Average of Hydropathy (GRAVY) for a protein sequence. GRAVY is the sum of the Kyte-Doolittle hydropathy values of all residues divided by the sequence length [8]: GRAVY = (sum of hydropathy values) / N A positive GRAVY value indicates an overall hydrophobic protein; a negative value indicates an overall hydrophilic protein. Parameters ========== :sequence: str protein sequence. Returns ======= :gravy_df: pd.DataFrame pandas DataFrame containing the GRAVY score for the input sequence. Dataframe will be of the shape 1 x 1. """ #validate input protein sequence sequence = _validate_sequence(sequence) #sum Kyte-Doolittle hydropathy values and divide by sequence length gravy_score = round(sum(kyte_doolittle_[aa] for aa in sequence) / len(sequence), 3) #transform to DataFrame gravy_df = pd.DataFrame([[gravy_score]], columns=["GRAVY"]) return gravy_df
################################## Aromaticity ############################################
[docs] def aromaticity(sequence: str) -> pd.DataFrame: """ Calculate Aromaticity of a protein sequence. Aromaticity is the fraction of aromatic amino acids (Phe, Trp, Tyr) in the sequence [9]: Aromaticity = (F + W + Y) / N where F, W and Y are the counts of Phe, Trp and Tyr residues, and N is the total sequence length. Parameters ========== :sequence: str protein sequence. Returns ======= :aromaticity_df: pd.DataFrame pandas DataFrame of shape 1 x 1 containing the aromaticity score. """ #validate input protein sequence sequence = _validate_sequence(sequence) #count aromatic residues (F, W, Y) and divide by sequence length aromatic_count = sum(sequence.count(aa) for aa in ("F", "W", "Y")) aromaticity_score = round(aromatic_count / len(sequence), 3) #transform to DataFrame aromaticity_df = pd.DataFrame([[aromaticity_score]], columns=["Aromaticity"]) return aromaticity_df
############################### Instability Index ######################################### #DIWV (dipeptide instability weight values) from Guruprasad et al. (1990) [10] #missing dipeptide pairs default to 1.0 (neutral contribution) diwv_ = { "WW": 1.0, "WC": 1.0, "WM": 24.68, "WH": 24.68, "WY": 1.0, "WF": 1.0, "WQ": 1.0, "WN": 13.34,"WK": 1.0, "WD": 1.0, "WT": -14.03,"WS": 1.0, "WR": 1.0, "WV": -7.49, "WP": 1.0, "WA": -14.03,"WI": 1.0, "WL": 13.34, "WG": -9.37,"WE": 1.0, "CW": 24.68,"CC": -6.54, "CM": 33.6, "CH": 33.6, "CY": 1.0, "CF": 1.0, "CQ": -6.54, "CN": 1.0, "CK": 1.0, "CD": 20.26, "CT": 33.6, "CS": 1.0, "CR": 1.0, "CV": -6.54, "CP": 20.26,"CA": 1.0, "CI": 1.0, "CL": 20.26, "CG": 1.0, "CE": 1.0, "MW": 1.0, "MC": 1.0, "MM": -1.88, "MH": 58.28, "MY": 24.68,"MF": 1.0, "MQ": -6.54, "MN": 1.0, "MK": 1.0, "MD": 1.0, "MT": -1.88, "MS": 44.94,"MR": -6.54,"MV": 1.0, "MP": 44.94,"MA": 13.34,"MI": 1.0, "ML": 1.0, "MG": 1.0, "ME": 1.0, "HW": -1.88,"HC": 1.0, "HM": 1.0, "HH": -1.88, "HY": 44.94,"HF": -9.37,"HQ": 1.0, "HN": 24.68,"HK": 24.68,"HD": 1.0, "HT": -6.54, "HS": -6.54,"HR": 1.0, "HV": 1.0, "HP": -1.88,"HA": 1.0, "HI": 44.94, "HL": 1.0, "HG": -9.37,"HE": 1.0, "YW": -9.37,"YC": 1.0, "YM": 44.94, "YH": 13.34, "YY": 13.34,"YF": 1.0, "YQ": 1.0, "YN": 1.0, "YK": -9.37,"YD": 24.68, "YT": -7.49, "YS": 1.0, "YR": -15.91,"YV": 1.0, "YP": 13.34,"YA": 24.68,"YI": 1.0, "YL": 1.0, "YG": -7.49,"YE": -6.54, "FW": 1.0, "FC": 1.0, "FM": 1.0, "FH": 1.0, "FY": 33.6, "FF": 1.0, "FQ": 1.0, "FN": 1.0, "FK": -14.03,"FD": 13.34,"FT": 1.0, "FS": 1.0, "FR": 1.0, "FV": 1.0, "FP": 20.26,"FA": 1.0, "FI": 1.0, "FL": 1.0, "FG": 1.0, "FE": 1.0, "QW": 1.0, "QC": -6.54,"QM": 1.0, "QH": 1.0, "QY": -6.54,"QF": -6.54,"QQ": 20.26, "QN": 1.0, "QK": 1.0, "QD": 20.26, "QT": 1.0, "QS": 44.94,"QR": 1.0, "QV": -6.54, "QP": 20.26,"QA": 1.0, "QI": 1.0, "QL": 1.0, "QG": 1.0, "QE": 20.26, "NW": -9.37,"NC": -1.88,"NM": 1.0, "NH": 1.0, "NY": 1.0, "NF": -14.03,"NQ": -6.54, "NN": 1.0, "NK": 24.68,"ND": 1.0, "NT": -7.49, "NS": 1.0, "NR": 1.0, "NV": 1.0, "NP": -1.88,"NA": 1.0, "NI": 44.94, "NL": 1.0, "NG": -14.03,"NE": 1.0, "KW": 1.0, "KC": 1.0, "KM": 33.6, "KH": -1.88, "KY": 1.0, "KF": 1.0, "KQ": 24.68, "KN": 1.0, "KK": 1.0, "KD": 1.0, "KT": 1.0, "KS": 1.0, "KR": 33.6, "KV": -7.49, "KP": -6.54,"KA": 1.0, "KI": -7.49, "KL": -7.49, "KG": -7.49,"KE": 1.0, "DW": 1.0, "DC": 1.0, "DM": 1.0, "DH": 1.0, "DY": 1.0, "DF": 1.0, "DQ": 1.0, "DN": 42.67,"DK": -7.49,"DD": 1.0, "DT": 1.0, "DS": 1.0, "DR": -6.54,"DV": 1.0, "DP": 1.0, "DA": 1.0, "DI": 1.0, "DL": 1.0, "DG": 1.0, "DE": 1.0, "TW": -14.03,"TC": 1.0, "TM": 1.0, "TH": 1.0, "TY": 1.0, "TF": 13.34,"TQ": -6.54, "TN": -14.03,"TK": 1.0, "TD": 1.0, "TT": 1.0, "TS": 1.0, "TR": 1.0, "TV": 1.0, "TP": 1.0, "TA": 13.34,"TI": 1.0, "TL": 1.0, "TG": -7.49,"TE": 20.26, "SW": 1.0, "SC": 33.6, "SM": 1.0, "SH": 1.0, "SY": 1.0, "SF": 1.0, "SQ": 20.26, "SN": 1.0, "SK": 1.0, "SD": 1.0, "ST": 1.0, "SS": 20.26,"SR": 20.26,"SV": 1.0, "SP": 44.94,"SA": 20.26,"SI": 1.0, "SL": 1.0, "SG": 1.0, "SE": 20.26, "RW": 58.28,"RC": 1.0, "RM": 1.0, "RH": 20.26, "RY": -6.54,"RF": 1.0, "RQ": 20.26, "RN": 13.34,"RK": 1.0, "RD": -6.54, "RT": 1.0, "RS": 44.94,"RR": 58.28,"RV": 1.0, "RP": 20.26,"RA": 1.0, "RI": 1.0, "RL": 1.0, "RG": -7.49,"RE": 1.0, "VW": 1.0, "VC": 1.0, "VM": 1.0, "VH": 1.0, "VY": -6.54,"VF": 1.0, "VQ": 1.0, "VN": 1.0, "VK": -7.49,"VD": -14.03,"VT": -7.49, "VS": 1.0, "VR": 1.0, "VV": 1.0, "VP": 20.26,"VA": 1.0, "VI": 1.0, "VL": 1.0, "VG": -7.49,"VE": 1.0, "PW": -6.54,"PC": -6.54,"PM": -6.54, "PH": 1.0, "PY": 1.0, "PF": 20.26,"PQ": 20.26, "PN": 1.0, "PK": 1.0, "PD": -6.54, "PT": 1.0, "PS": 20.26,"PR": -6.54,"PV": 20.26, "PP": 20.26,"PA": 20.26,"PI": 1.0, "PL": 1.0, "PG": 1.0, "PE": 18.38, "AW": -14.03,"AC": 44.94,"AM": 1.0, "AH": -7.49, "AY": 1.0, "AF": 1.0, "AQ": 1.0, "AN": 1.0, "AK": 1.0, "AD": -7.49, "AT": 1.0, "AS": 1.0, "AR": 1.0, "AV": 1.0, "AP": 20.26,"AA": 1.0, "AI": 1.0, "AL": 1.0, "AG": 1.0, "AE": 1.0, "IW": 1.0, "IC": 1.0, "IM": 1.0, "IH": 13.34, "IY": 1.0, "IF": 1.0, "IQ": 1.0, "IN": 1.0, "IK": -7.49,"ID": 1.0, "IT": 1.0, "IS": 1.0, "IR": 1.0, "IV": -7.49, "IP": 1.0, "IA": 1.0, "II": 1.0, "IL": 1.0, "IG": 1.0, "IE": 44.94, "LW": 24.68,"LC": 1.0, "LM": 1.0, "LH": 1.0, "LY": 1.0, "LF": 1.0, "LQ": 33.6, "LN": 1.0, "LK": -7.49,"LD": 1.0, "LT": 1.0, "LS": 1.0, "LR": 20.26,"LV": 1.0, "LP": 20.26,"LA": 1.0, "LI": 1.0, "LL": 1.0, "LG": 1.0, "LE": 1.0, "GW": 13.34,"GC": 1.0, "GM": 1.0, "GH": 1.0, "GY": -7.49,"GF": 1.0, "GQ": 1.0, "GN": -7.49,"GK": -7.49,"GD": 1.0, "GT": -7.49, "GS": 1.0, "GR": 1.0, "GV": 1.0, "GP": 1.0, "GA": 1.0, "GI": 1.0, "GL": 1.0, "GG": 13.34,"GE": -6.54, "EW": -14.03,"EC": 44.94,"EM": 1.0, "EH": -6.54, "EY": 1.0, "EF": 1.0, "EQ": 20.26, "EN": 1.0, "EK": 1.0, "ED": 1.0, "ET": -6.54, "ES": 20.26,"ER": 1.0, "EV": 1.0, "EP": 20.26,"EA": 1.0, "EI": 1.0, "EL": 1.0, "EG": 1.0, "EE": 20.26 }
[docs] def instability_index(sequence: str) -> pd.DataFrame: """ Calculate the Instability Index (II) of a protein sequence. The instability index is a measure of in vivo stability, computed as a weighted sum of dipeptide instability weight values (DIWV) [10]: II = (10 / N) * sum(DIWV(x_i, x_{i+1})) for i in 1..N-1 Proteins with II < 40 are predicted to be stable; II >= 40 indicates instability. Parameters ========== :sequence: str protein sequence. Returns ======= :instability_index_df: pd.DataFrame pandas DataFrame of shape 1 x 1 containing the instability index. """ #validate input protein sequence sequence = _validate_sequence(sequence) #sum DIWV values for all consecutive dipeptides; default to 1.0 for unlisted pairs dipeptide_sum = sum(diwv_.get(sequence[i:i+2], 1.0) for i in range(len(sequence) - 1)) ii = round((dipeptide_sum * 10.0) / len(sequence), 3) #transform to DataFrame instability_index_df = pd.DataFrame([[ii]], columns=["InstabilityIndex"]) return instability_index_df
################################# Isoelectric Point ###################################### #pKa values for ionisable groups used in the isoelectric point calculation [11] pka_values_ = { "D": 3.9, "E": 4.1, "H": 6.0, "C": 8.3, "Y": 10.1, "K": 10.5, "R": 12.5, "N_term": 8.0, "C_term": 3.1 }
[docs] def isoelectric_point(sequence: str) -> pd.DataFrame: """ Calculate the theoretical Isoelectric Point (pI) of a protein sequence. pI is the pH at which the net charge of the protein is zero. It is computed by iteratively adjusting pH until positive and negative charges balance [11]. Parameters ========== :sequence: str protein sequence. Returns ======= :isoelectric_point_df: pd.DataFrame pandas DataFrame of shape 1 x 1 containing the pI value. """ #validate input protein sequence sequence = _validate_sequence(sequence) #count ionisable residues counts = {aa: sequence.count(aa) for aa in "DEHCYKR"} #iterative charge-balance: adjust pH by +/- 0.001 until net charge < 1e-4 ph = 7.0 for _ in range(5000): pos = (counts["K"] / (1 + 10 ** (ph - pka_values_["K"])) + counts["R"] / (1 + 10 ** (ph - pka_values_["R"])) + counts["H"] / (1 + 10 ** (ph - pka_values_["H"])) + 1.0 / (1 + 10 ** (ph - pka_values_["N_term"]))) neg = (counts["D"] / (1 + 10 ** (pka_values_["D"] - ph)) + counts["E"] / (1 + 10 ** (pka_values_["E"] - ph)) + counts["C"] / (1 + 10 ** (pka_values_["C"] - ph)) + counts["Y"] / (1 + 10 ** (pka_values_["Y"] - ph)) + 1.0 / (1 + 10 ** (pka_values_["C_term"] - ph))) net_charge = pos - neg if abs(net_charge) < 1e-4: break ph += 0.001 if net_charge > 0 else -0.001 #transform to DataFrame isoelectric_point_df = pd.DataFrame([[round(ph, 3)]], columns=["IsoelectricPoint"]) return isoelectric_point_df
############################### Molecular Weight ########################################## #average residue molecular weights (Da) for each of the 20 standard amino acids [12] residue_molecular_weight_ = { "A": 89.094, "C": 121.154, "D": 133.104, "E": 147.130, "F": 165.192, "G": 75.032, "H": 155.156, "I": 131.174, "K": 146.189, "L": 131.174, "M": 149.208, "N": 132.119, "P": 115.132, "Q": 146.146, "R": 174.203, "S": 105.093, "T": 119.119, "V": 117.148, "W": 204.228, "Y": 181.191 }
[docs] def molecular_weight(sequence: str) -> pd.DataFrame: """ Calculate the Molecular Weight (MW) of a protein sequence. MW is computed as the sum of average residue masses minus one water molecule per peptide bond (18.015 Da) [12]: MW = sum(residue_mass_i) - (N - 1) * 18.015 Parameters ========== :sequence: str protein sequence. Returns ======= :molecular_weight_df: pd.DataFrame pandas DataFrame of shape 1 x 1 containing the molecular weight (Da). """ #validate input protein sequence sequence = _validate_sequence(sequence) #sum residue masses and subtract one water per peptide bond mw = sum(residue_molecular_weight_[aa] for aa in sequence) - (len(sequence) - 1) * 18.015 molecular_weight_df = pd.DataFrame([[round(mw, 3)]], columns=["MolecularWeight"]) return molecular_weight_df
############################## Charge Distribution ########################################
[docs] def charge_distribution(sequence: str, ph: float = 7.4) -> pd.DataFrame: """ Calculate Charge Distribution of a protein sequence at a given pH. Positive charge is contributed by Lys (K), Arg (R) and His (H); negative charge by Asp (D) and Glu (E). Henderson-Hasselbalch equations are used [13]: positive = sum(count(aa) / (1 + 10^(pH - pKa))) for aa in {K, R, H} negative = sum(count(aa) / (1 + 10^(pKa - pH))) for aa in {D, E} net = positive - negative Parameters ========== :sequence: str protein sequence. :ph: float (default=7.4) pH at which to evaluate charges. Returns ======= :charge_distribution_df: pd.DataFrame pandas DataFrame of shape 1 x 3 with columns ["PositiveCharge", "NegativeCharge", "NetCharge"]. """ #validate input protein sequence sequence = _validate_sequence(sequence) #pKa values for charged residues pka_charged_ = {"K": 10.5, "R": 12.5, "H": 6.0, "D": 3.9, "E": 4.1} #compute positive charge (K, R, H) using Henderson-Hasselbalch pos = sum(sequence.count(aa) / (1 + 10 ** (ph - pka_charged_[aa])) for aa in ("K", "R", "H")) #compute negative charge (D, E) neg = sum(sequence.count(aa) / (1 + 10 ** (pka_charged_[aa] - ph)) for aa in ("D", "E")) charge_distribution_df = pd.DataFrame( [[round(pos, 3), round(neg, 3), round(pos - neg, 3)]], columns=["PositiveCharge", "NegativeCharge", "NetCharge"] ) return charge_distribution_df
####################### Hydrophobic/Polar/Charged Composition ############################ #residue class membership for hydrophobic, polar and charged groupings hydrophobic_residues_ = frozenset("ACFILMVWY") polar_residues_ = frozenset("GNQST") charged_residues_ = frozenset("DERHK")
[docs] def hydrophobic_polar_charged_composition(sequence: str) -> pd.DataFrame: """ Calculate Hydrophobic, Polar and Charged Composition of a protein sequence. Residues are grouped into three physicochemical classes [1]: - Hydrophobic (nonpolar): A, C, F, I, L, M, V, W, Y - Polar (uncharged): G, N, Q, S, T - Charged: D, E, R, H, K Each value is expressed as a percentage of the total sequence length. Parameters ========== :sequence: str protein sequence. Returns ======= :hpc_df: pd.DataFrame pandas DataFrame of shape 1 x 3 with columns ["Hydrophobic", "Polar", "Charged"]. """ #validate input protein sequence sequence = _validate_sequence(sequence) n = len(sequence) #compute fraction of each class as percentage h = round(sum(1 for aa in sequence if aa in hydrophobic_residues_) / n * 100, 3) p = round(sum(1 for aa in sequence if aa in polar_residues_) / n * 100, 3) c = round(sum(1 for aa in sequence if aa in charged_residues_) / n * 100, 3) hpc_df = pd.DataFrame([[h, p, c]], columns=["Hydrophobic", "Polar", "Charged"]) return hpc_df
########################## Secondary Structure Propensity ################################ #Chou-Fasman propensity values for alpha-helix, beta-sheet and coil [14] helix_propensity_ = { "A": 1.45, "C": 0.77, "D": 0.98, "E": 1.53, "F": 1.12, "G": 0.53, "H": 1.24, "I": 1.00, "K": 1.07, "L": 1.34, "M": 1.20, "N": 0.73, "P": 0.59, "Q": 1.17, "R": 0.79, "S": 0.79, "T": 0.82, "V": 1.14, "W": 1.14, "Y": 0.61 } sheet_propensity_ = { "A": 0.97, "C": 1.30, "D": 0.80, "E": 0.26, "F": 1.28, "G": 0.81, "H": 0.71, "I": 1.60, "K": 0.74, "L": 1.22, "M": 1.67, "N": 0.65, "P": 0.62, "Q": 1.23, "R": 0.90, "S": 0.72, "T": 1.20, "V": 1.65, "W": 1.19, "Y": 1.29 } coil_propensity_ = { "A": 0.72, "C": 1.17, "D": 1.41, "E": 0.80, "F": 0.58, "G": 1.64, "H": 1.00, "I": 0.51, "K": 1.01, "L": 0.57, "M": 0.67, "N": 1.68, "P": 1.91, "Q": 0.98, "R": 1.24, "S": 1.33, "T": 1.03, "V": 0.50, "W": 0.99, "Y": 1.29 }
[docs] def secondary_structure_propensity(sequence: str) -> pd.DataFrame: """ Calculate Secondary Structure Propensity of a protein sequence. Each residue is assigned Chou-Fasman propensity values for alpha-helix (H), beta-sheet (E) and coil/loop (C) conformations [14]. The mean propensity for each secondary structure class across the whole sequence is returned. Parameters ========== :sequence: str protein sequence. Returns ======= :ssp_df: pd.DataFrame pandas DataFrame of shape 1 x 3 with columns ["Helix", "Sheet", "Coil"]. """ #validate input protein sequence sequence = _validate_sequence(sequence) n = len(sequence) #average propensity per secondary structure class h = round(sum(helix_propensity_[aa] for aa in sequence) / n, 3) e = round(sum(sheet_propensity_[aa] for aa in sequence) / n, 3) c = round(sum(coil_propensity_[aa] for aa in sequence) / n, 3) ssp_df = pd.DataFrame([[h, e, c]], columns=["Helix", "Sheet", "Coil"]) return ssp_df
################################# k-mer Composition ######################################
[docs] def kmer_composition(sequence: str, k: int = 2) -> pd.DataFrame: """ Calculate k-mer Composition of a protein sequence. A k-mer is a subsequence of length k. For each of the 20^k possible k-mers the frequency relative to the total number of k-mers in the sequence is calculated [1]. Parameters ========== :sequence: str protein sequence. :k: int (default=2) length of k-mer subsequences (k >= 1). Returns ======= :kmer_composition_df: pd.DataFrame pandas DataFrame of shape 1 x 20^k containing fractional k-mer frequencies. """ #validate input protein sequence sequence = _validate_sequence(sequence) #validate k; default to 2 if invalid; raise error for impractically large k if not isinstance(k, int) or k < 1: k = 2 if k > 4: raise ValueError( f"k={k} would generate {20**k:,} columns (20^k). Maximum supported k is 4." ) #generate all possible k-mers from the 20 canonical amino acids all_kmers = ["".join(p) for p in itertools.product(amino_acids, repeat=k)] total = len(sequence) - k + 1 kmer_comp = {} #count each k-mer occurrence, divide by total k-mer count for kmer in all_kmers: cnt = sum(1 for i in range(total) if sequence[i:i+k] == kmer) kmer_comp[kmer] = round(cnt / total * 100, 3) if total > 0 else 0.0 kmer_composition_df = pd.DataFrame([list(kmer_comp.values())], columns=list(kmer_comp.keys())) return kmer_composition_df
########################## Reduced Alphabet Composition ################################## #predefined reduced alphabets mapping 20 amino acids to fewer groups #key = alphabet size, value = list of frozensets (each frozenset = one group) reduced_alphabets_ = { 2: [frozenset("ACFILMVWY"), frozenset("DEGNHKPQRST")], 3: [frozenset("ACFILMVWY"), frozenset("GNQST"), frozenset("DERHKP")], 4: [frozenset("ACFILMVW"), frozenset("GNQSTY"), frozenset("DERHK"), frozenset("P")], 6: [ frozenset("ACST"), # small polar/aliphatic frozenset("FILMVWY"),# hydrophobic/aromatic frozenset("DE"), # acidic frozenset("KRH"), # basic frozenset("NQ"), # amide frozenset("GP"), # special ] }
[docs] def reduced_alphabet_composition(sequence: str, alphabet_size: int = 6) -> pd.DataFrame: """ Calculate Reduced Alphabet Composition of a protein sequence. The 20 standard amino acids are clustered into a smaller alphabet of physicochemically similar groups. The fraction of residues belonging to each group is returned [1]. Supported alphabet sizes: 2, 3, 4, 6 (default=6). Invalid sizes are reset to 6. Parameters ========== :sequence: str protein sequence. :alphabet_size: int (default=6) number of amino acid groups (2, 3, 4 or 6). Returns ======= :reduced_alphabet_df: pd.DataFrame pandas DataFrame of shape 1 x alphabet_size. """ #validate input protein sequence sequence = _validate_sequence(sequence) #default to 6 if an unsupported size is requested if alphabet_size not in reduced_alphabets_: alphabet_size = 6 groups = reduced_alphabets_[alphabet_size] n = len(sequence) values = [] col_names = [] #compute fractional composition for each group for idx, group in enumerate(groups): freq = round(sum(1 for aa in sequence if aa in group) / n * 100, 3) values.append(freq) col_names.append(f"ReducedAlphabet_{idx + 1}") reduced_alphabet_df = pd.DataFrame([values], columns=col_names) return reduced_alphabet_df
############################## Motif Composition ########################################## #default set of biologically relevant motifs with regex patterns [15] default_motifs_ = { "NxS/T_glycosylation": r"N[^P][ST]", "RGD_integrin": r"RGD", "KDEL_retention": r"KDEL", "CxxC_zinc_finger": r"C.{2}C", "CAAX_prenylation": r"C[ACFILMVWY]{2}[ACEILMQV]$", "cAMP_PKA": r"RR.S", "dileucine_sorting": r"[DE]xxxL[LI]$", "PEST_region": r"P.{1,10}[ESD].{1,10}T" }
[docs] def motif_composition(sequence: str, motifs: dict[str, str] | None = None) -> pd.DataFrame: """ Calculate Motif Composition of a protein sequence. For each motif in the provided dictionary, the number of regex matches within the sequence is counted [15]. If no motifs are supplied, a set of biologically relevant default motifs is used. Parameters ========== :sequence: str protein sequence. :motifs: dict or None (default=None) dictionary mapping motif names (str) to regex patterns (str). If None, the built-in default_motifs dict is used. Returns ======= :motif_composition_df: pd.DataFrame pandas DataFrame of shape 1 x M, where M is the number of motifs. Each value is the integer count of matches for that motif. """ #validate input protein sequence sequence = _validate_sequence(sequence) #use default motifs if none provided if motifs is None: motifs = default_motifs_ motif_counts = {} #count overlapping matches for each motif pattern for name, pattern in motifs.items(): motif_counts[name] = len(re.findall(r'(?=' + pattern + r')', sequence)) motif_composition_df = pd.DataFrame([list(motif_counts.values())], columns=list(motif_counts.keys())) return motif_composition_df
######################### Amino Acid Pair Composition #####################################
[docs] def amino_acid_pair_composition(sequence: str) -> pd.DataFrame: """ Calculate Amino Acid Pair Composition (PairComp) of a protein sequence. For each pair of consecutive residues (i, i+1), the fractional frequency of all 400 possible dipeptide types (20 x 20) is computed [1]: PairComp(s,t) = count(s,t) / (N - 1) * 100 Unlike the standard dipeptide composition, results are annotated with the physicochemical class of each residue in the pair (Hydrophobic/Polar/Charged). Parameters ========== :sequence: str protein sequence. Returns ======= :pair_composition_df: pd.DataFrame pandas DataFrame of shape 1 x 400, where each column label has the form 'XY_Class1-Class2' (e.g. 'AL_Hydrophobic-Hydrophobic'). """ #validate input protein sequence sequence = _validate_sequence(sequence) #map each amino acid to its physicochemical class def _aa_class(aa): if aa in hydrophobic_residues_: return "Hydrophobic" elif aa in polar_residues_: return "Polar" return "Charged" total = len(sequence) - 1 pair_comp = {} #build the 400 annotated columns and compute frequencies for i in amino_acids: for j in amino_acids: label = f"{i}{j}_{_aa_class(i)}-{_aa_class(j)}" count = sum(1 for k in range(total) if sequence[k] == i and sequence[k+1] == j) pair_comp[label] = round(count / total * 100, 3) if total > 0 else 0.0 pair_composition_df = pd.DataFrame([list(pair_comp.values())], columns=list(pair_comp.keys())) return pair_composition_df
################################## Aliphatic Index ########################################
[docs] def aliphatic_index(sequence: str) -> pd.DataFrame: """ Calculate the Aliphatic Index (AI) of a protein sequence. The aliphatic index is a measure of thermostability, defined as the relative volume occupied by aliphatic side chains (Ala, Val, Ile, Leu) [16]: AI = Xala + 2.9 * Xval + 3.9 * (Xile + Xleu) where X values are mole percentages of each residue. Higher values indicate greater thermostability. Parameters ========== :sequence: str protein sequence. Returns ======= :aliphatic_index_df: pd.DataFrame pandas DataFrame of shape 1 x 1 containing the aliphatic index score. """ #validate input protein sequence sequence = _validate_sequence(sequence) n = len(sequence) #compute mole percentages for aliphatic residues xa = sequence.count("A") / n * 100 xv = sequence.count("V") / n * 100 xi = sequence.count("I") / n * 100 xl = sequence.count("L") / n * 100 #apply Ikai (1980) aliphatic index formula ai = round(xa + 2.9 * xv + 3.9 * (xi + xl), 3) aliphatic_index_df = pd.DataFrame([[ai]], columns=["AliphaticIndex"]) return aliphatic_index_df
############################### Extinction Coefficient ####################################
[docs] def extinction_coefficient(sequence: str) -> pd.DataFrame: """ Calculate the molar Extinction Coefficient of a protein sequence at 280 nm. The coefficient is derived from the number of Trp (W), Tyr (Y) and Cys (C) residues using the method of Gasteiger et al. (2005) [17]: EC = (W * 5500) + (Y * 1490) + (SS * 125) where SS is the number of disulfide bonds (Cys_count // 2) for the oxidised form, and 0 for the reduced form. Both are returned. Parameters ========== :sequence: str protein sequence. Returns ======= :extinction_coefficient_df: pd.DataFrame pandas DataFrame of shape 1 x 2 with columns ``ExtCoeff_Reduced`` and ``ExtCoeff_Oxidized`` (M⁻¹cm⁻¹). """ #validate input protein sequence sequence = _validate_sequence(sequence) w = sequence.count("W") y = sequence.count("Y") c = sequence.count("C") #reduced: no disulfide bonds ec_reduced = w * 5500 + y * 1490 #oxidized: each pair of Cys contributes one disulfide bond (125 M⁻¹cm⁻¹) ec_oxidized = ec_reduced + (c // 2) * 125 extinction_coefficient_df = pd.DataFrame( [[ec_reduced, ec_oxidized]], columns=["ExtCoeff_Reduced", "ExtCoeff_Oxidized"] ) return extinction_coefficient_df
################################ Boman Index ############################################## #solubility scale from Boman (2003) [18] boman_scale_ = { "A": 2.1, "C": -0.7, "D": -3.5, "E": -3.5, "F": 2.8, "G": 2.1, "H": -3.2, "I": 4.5, "K": -3.9, "L": 4.5, "M": 1.9, "N": -4.8, "P": -1.6, "Q": -3.5, "R": -0.8, "S": -0.8, "T": -0.7, "V": 4.2, "W": -0.9, "Y": -1.3 }
[docs] def boman_index(sequence: str) -> pd.DataFrame: """ Calculate the Boman Index (potential protein interaction index) for a protein sequence. The index is the sum of the solubility values of all residues divided by the sequence length [18]: BI = sum(solubility(aa_i)) / N Higher values indicate a greater tendency to bind other proteins or cell membranes (e.g. antimicrobial peptides). Values >2.48 suggest high interaction potential. Parameters ========== :sequence: str protein sequence. Returns ======= :boman_index_df: pd.DataFrame pandas DataFrame of shape 1 x 1 containing the Boman Index score. """ #validate input protein sequence sequence = _validate_sequence(sequence) #sum solubility values and divide by sequence length bi = round(sum(boman_scale_[aa] for aa in sequence) / len(sequence), 3) boman_index_df = pd.DataFrame([[bi]], columns=["BomanIndex"]) return boman_index_df
############################### Aggregation Propensity ####################################
[docs] def aggregation_propensity(sequence: str, window: int = 5, hydrophobicity_threshold: float = 2.0, charge_threshold: int = 1) -> pd.DataFrame: """ Calculate the Aggregation Propensity of a protein sequence. Aggregation- prone regions (APRs) are identified using a sliding window: a window is classified as an APR when its mean Kyte-Doolittle hydrophobicity exceeds ``hydrophobicity_threshold`` and the number of charged residues (D, E, K, R) is below ``charge_threshold`` [8]. Parameters ========== :sequence: str protein sequence. :window: int (default=5) sliding window length in residues. :hydrophobicity_threshold: float (default=2.0) minimum mean Kyte-Doolittle hydrophobicity for an APR window. :charge_threshold: int (default=1) maximum number of charged residues (D, E, K, R) allowed in an APR. Returns ======= :aggregation_propensity_df: pd.DataFrame pandas DataFrame of shape 1 x 2 with columns: ``AggregProneRegions`` — number of non-overlapping-scored APR windows; ``AggregProneFraction`` — percentage of residues covered by any APR. """ #validate input protein sequence sequence = _validate_sequence(sequence) n = len(sequence) charged = frozenset("DEKR") #track which residue positions are covered by at least one APR covered = [False] * n apr_count = 0 #score each window for i in range(n - window + 1): seg = sequence[i:i + window] avg_hydro = sum(kyte_doolittle_[aa] for aa in seg) / window n_charged = sum(1 for aa in seg if aa in charged) if avg_hydro >= hydrophobicity_threshold and n_charged < charge_threshold: apr_count += 1 for j in range(i, i + window): covered[j] = True apr_fraction = round(sum(covered) / n * 100, 3) aggregation_propensity_df = pd.DataFrame( [[apr_count, apr_fraction]], columns=["AggregProneRegions", "AggregProneFraction"] ) return aggregation_propensity_df
################################## Shannon Entropy ########################################
[docs] def shannon_entropy(sequence: str) -> pd.DataFrame: """ Calculate the Shannon Entropy of a protein sequence. Shannon entropy quantifies the diversity of the amino acid composition using the information-theoretic formula: H = -sum_i ( p_i * log2(p_i) ) where p_i is the fractional frequency of amino acid i in the sequence. A value near zero indicates a low-complexity or repetitive sequence; the theoretical maximum is log2(20) ≈ 4.322 bits for a perfectly uniform distribution across all 20 canonical amino acids. Shannon entropy is widely used as a sequence quality filter and complexity measure in machine-learning pipelines [1]. Parameters ========== :sequence: str protein sequence. Returns ======= :shannon_entropy_df: pd.DataFrame pandas DataFrame of shape 1 x 1 with column ``ShannonEntropy``. Value is rounded to 3 decimal places. """ #validate input protein sequence sequence = _validate_sequence(sequence) n = len(sequence) #count occurrences of each amino acid present in the sequence counts = {} for aa in sequence: counts[aa] = counts.get(aa, 0) + 1 #compute entropy using Shannon formula: H = -sum(p_i * log2(p_i)) entropy = 0.0 for count in counts.values(): p = count / n entropy -= p * math.log2(p) shannon_entropy_df = pd.DataFrame([[round(entropy, 3)]], columns=["ShannonEntropy"]) return shannon_entropy_df
############################### Hydrophobic Moment ########################################
[docs] def hydrophobic_moment(sequence: str, window: int = 11, angle: float = 100) -> pd.DataFrame: """ Calculate the mean and maximum Hydrophobic Moment of a protein sequence. The hydrophobic moment measures the amphiphilicity of a helical segment — the directional asymmetry of hydrophobicity projected around a helical axis [19]. For each window of length ``window``, the moment is: mu = sqrt( (sum H_i * sin(i * angle))^2 + (sum H_i * cos(i * angle))^2 ) / window where H_i is the Eisenberg hydrophobicity of residue i and angle is in degrees. Default angle of 100° corresponds to an alpha-helix; 160° is commonly used for beta-strands. Parameters ========== :sequence: str protein sequence. :window: int (default=11) sliding window length in residues. :angle: float (default=100) residue rotation angle in degrees (100° = alpha-helix). Returns ======= :hydrophobic_moment_df: pd.DataFrame pandas DataFrame of shape 1 x 2 with columns ``HydrophobicMoment_Mean`` and ``HydrophobicMoment_Max``. """ #validate input protein sequence sequence = _validate_sequence(sequence) n = len(sequence) #use full sequence as single window when shorter than default window effective_window = min(window, n) angle_rad = angle * math.pi / 180.0 moments = [] #compute hydrophobic moment for each window position using Eisenberg scale for i in range(n - effective_window + 1): seg = sequence[i:i + effective_window] hsin = sum(hydrophobicity_[aa] * math.sin(j * angle_rad) for j, aa in enumerate(seg)) hcos = sum(hydrophobicity_[aa] * math.cos(j * angle_rad) for j, aa in enumerate(seg)) mu = math.sqrt(hsin ** 2 + hcos ** 2) / effective_window moments.append(mu) mean_moment = round(sum(moments) / len(moments), 3) max_moment = round(max(moments), 3) hydrophobic_moment_df = pd.DataFrame( [[mean_moment, max_moment]], columns=["HydrophobicMoment_Mean", "HydrophobicMoment_Max"] ) return hydrophobic_moment_df
############################## Pseudo Amino Acid Composition ##############################
[docs] def pseudo_amino_acid_composition(sequence: str, lamda: int = 30, weight: float = 0.05, properties: list[str] | None = None) -> pd.DataFrame: """ Pseudo amino acid composition (PAAComp) combines the vanilla amino acid composition descriptor with additional local features, such as correlation between residues of a certain distance, as amino acid composition doesn't take into account sequence order info. The pseudo components of the descriptor are a series rank-different correlation factors [4, 5]. The first 20 components are a weighted sum of the amino acid composition and 30 are physicochemical square correlations as dictated by the lamda and properties parameters. This generates an output of [1, (20 + lamda)] = 1 x 50 when using the default lamda of 30. By default, the physicochemical properties used are hydrophobicity and hydrophillicity, with a lamda of 30 and weight of 0.05. Parameters ========== :lamda: int rank of correlation. Number of calculable descriptors depends on lamda value. :weight: float weighting factor. :properties: list list of dicts of physicochemical/structural property values for amino acids. Returns ======= :pseudo_amino_acid_composition_df: pd.Dataframe output dataframe of calculated pseudo amino acid composition descriptors for input sequence. Dataframe will be of the shape 1 x N, where N is the number of features calculated from the descriptor (20 + lamda) and 1 is the input sequence. By default, the shape will be 1 x 50 (using default lamda=30). """ #validate input protein sequence sequence = _validate_sequence(sequence) #set lamda to its default value of 30 if <0, or > sequence len or not an int if ((lamda < 0) or (lamda > len(sequence)) or not isinstance(lamda, int)): lamda = 30 #validate weight, set default weight of 0.05 if invalid value input if ((weight < 0) or not (isinstance(weight, (float, int)))): weight = 0.05 #normalize None to empty list before further processing if properties is None: properties = [] #cast properties to list if not (isinstance(properties, list)): properties = [properties] property_values = [] #by default use properties (hydrphocity, hydrophilicty, residue mass), #otherwise get the individual property values from aaindex1 using accession numbers if not properties: property_values = [hydrophobicity_, hydrophilicity_, residue_mass_] else: for prop in properties: if (prop not in aaindex1.record_codes()): raise ValueError(f"Accession number not found in aaindex1: {prop}.") property_values.append(aaindex1[prop].values) right_part = 0.0 psuedo_aac = {} rightpart = [] #calculate correlation factor for protein sequence using propeties for i in range(lamda): right_part = right_part + sequence_order_correlation_factor(sequence, i+1, property_values) #calculate amino acid composition aa_comp = amino_acid_composition(sequence) #compute first 20 pseudo amino acid descriptor components based on property_values, #append descriptor values to dict temp = 1 + weight + right_part for index, i in enumerate(amino_acids): psuedo_aac["PAAC_" + str(index + 1)] = round(aa_comp[i].values[0] / temp, 3) #calculate correlation factor for protein sequence using propeties for i in range(lamda): rightpart.append(sequence_order_correlation_factor(sequence, i + 1, property_values)) #compute last 20+lambda pseudo amino acid descriptor components based on property_values, #append descriptor values to dict temp = 1 + weight * sum(rightpart) for index in range(20, 20 + lamda): psuedo_aac["PAAC_" + str(index + 1)] = round( weight * rightpart[index - 20] / temp * 100, 3) #transform values and columns to DataFrame psuedo_amino_acid_composition_df = pd.DataFrame([list(psuedo_aac.values())], columns=list(psuedo_aac.keys())) return psuedo_amino_acid_composition_df
[docs] def sequence_order_correlation_factor(sequence: str, k: int = 1, properties: list[dict[str, float]] | None = None) -> float: """ Calculating sequence order correlation factor with gap equal to k based on the given input properities for a protein sequence. Parameters ========== :k: int gap between amino acids in the sequence. :properties: list list of dicts of physicochemical/structural property values for amino acids. Returns ======= :result: float correlation factor value for the sequence using the correlation function for the particular sequence. """ # default to empty list if not provided (avoid mutable default argument) if properties is None: properties = [] correlation_factor = [] #iterate through sequence using correlation function to calculate sequence order #between its amino acids with gap=k for i in range(len(sequence) - k): aa1 = sequence[i] aa2 = sequence[i + k] correlation_factor.append(correlation_function(aa1, aa2, properties)) #round array of correlation factors correlation_factor_ = round(sum(correlation_factor) / (len(sequence) - k), 3) return correlation_factor_
[docs] def correlation_function(Ri: str = "S", Rj: str = "D", properties: list[dict[str, float]] | None = None) -> float: """ Calculate the correlation between 2 input amino acids based on the physicochemical/structural properties from the protein sequence. Parameters ========== :Ri: str 1st amino acid. :Rj: str 2nd amino acid. :properties: list list of dicts of physicochemical/structural property values for amino acids. Returns ======= :correlation: float correlation value for the two input amino acids based on input properties. """ # default to empty list if not provided (avoid mutable default argument) if properties is None: properties = [] theta = 0.0 #iterate over each property, calculating the correlation from the normalzied values for i in range(len(properties)): temp = normalize_property(properties[i]) theta = theta + math.pow(temp[Ri] - temp[Rj], 2) #calculate correlation for all properties correlation = round(theta / len(properties), 3) return correlation
[docs] def normalize_property(properties: dict[str, float]) -> dict[str, float]: """ Normalize physicochemical/structural property values using their mean and standard deviation. Parameters ========== :properties: dict dictionary of amino acid values for a physicochemical property. Returns ======= :normalized_vals: dict dict of normalized property values. """ normalized_vals = {} #iterate over all property values, calculate their mean and standard dev for i, j in properties.items(): mean = sum(properties.values()) / len(properties.values()) normalized_vals[i] = (j - mean) / _std(properties.values(), mean, ddof=0) return normalized_vals
def _std(prop: list[float], mean: float, ddof: int = 1) -> float: """ Calculate standard deviation from list of physicochemical values using the mean of their values. Parameters ========== :prop: dict dictionary of property values for each amino acid. :mean: float mean of physicochemical property values. :ddof: int (default=1) delta degrees of freedom. Returns ======= :std_: float calculated standard deviation. """ temp = [math.pow(i - mean, 2) for i in prop] std_ = math.sqrt(sum(temp) / (len(prop) - ddof)) return std_ ######################## Amphiphilic Pseudo Amino Acid Composition ########################
[docs] def amphiphilic_pseudo_amino_acid_composition(sequence: str, lamda: int = 30, weight: float = 0.5, properties: list[dict[str, float]] = [hydrophobicity_, hydrophilicity_]) -> pd.DataFrame: """ Amphiphillic pseudo amino acid composition (APAAComp) has the same form as the amino acid composition, but contains much more information that is related to the sequence order of a protein and the distribution of the hydrophobic and hydrophilic amino acids along its chain [5]. The first 20 numbers in the descriptor are the components of the conventional amino acid composition; the next 2*λ numbers are a set of correlation factors that reflect different hydrophobicity and hydrophilicity distribution patterns along a protein chain. Parameters ========== :sequence: str protein sequence. :lamda: int rank of correlation. Number of calculable descriptors depends on lambda value. :weight: float weighting factor. :properties: list (default=[hydrophobicity, hydrophilicity]) list of dicts of physicochemical/structural property values for amino acids. Returns ======= :amp_pseudo_amino_acid_composition_df: pd.Dataframe output dataframe of calculated amphiphilic pseudo amino acid composition descriptors for input sequence. Dataframe will be of the shape 1 x N, where N is the number of features calculated from the descriptor (20 + 2*lambda) and 1 is the input sequence. By default, the shape will be 1 x 80 (using default lamda=30). """ #validate input protein sequence sequence = _validate_sequence(sequence) #set lamda to its default value of 30 if <0, or > sequence len or not an int if ((lamda < 0) or (lamda > len(sequence)) or not isinstance(lamda, int)): lamda = 30 #validate weight, set default weight of 0.5 if invalid value input if ((weight < 0) or not (isinstance(weight, (float, int)))): weight = 0.5 #cast properties to list if not (isinstance(properties, list)): properties = [properties] amp_pseudo_amino_acid_composition = {} #calculate correlation factor in protein sequence rightpart = 0.0 for i in range(lamda): rightpart = rightpart + sum( amphiphilic_sequence_order_correlation_factor(sequence, k=i + 1)) #calculate amino acid composition aa_composition = amino_acid_composition(sequence) #compute first 20 pseudo amino acid descriptor components based on properties temp = 1 + weight * rightpart for index, i in enumerate(amino_acids): amp_pseudo_amino_acid_composition["APAAC_" + str(index + 1)] = round(aa_composition[i].values[0] / temp, 3) #calculate correlation factor in protein sequence rightpart = [] for i in range(lamda): temp = amphiphilic_sequence_order_correlation_factor(sequence, k=i + 1) rightpart.append(temp[0]) rightpart.append(temp[1]) #compute remaining (20 + 2 * lamda) amphiphillic composition values temp = 1 + weight * sum(rightpart) for index in range(20, 20 + 2 * lamda): amp_pseudo_amino_acid_composition["APAAC_" + str(index + 1)] = round( weight * rightpart[index - 20] / temp * 100, 3) #transform values and columns to DataFrame amp_pseudo_amino_acid_composition_df = pd.DataFrame([list(amp_pseudo_amino_acid_composition.values())], columns=list(amp_pseudo_amino_acid_composition.keys())) return amp_pseudo_amino_acid_composition_df
[docs] def amphiphilic_sequence_order_correlation_factor(sequence: str, k: int = 1) -> list[float]: """ Calculate Amphipillic sequence order correlation factor for sequence with gap=k. Parameters ========== :sequence: str protein sequence. :k: int (default=1) gap between amino acids in the sequence. Returns ======= :correlation_factor: list list of correlation factors for both hydrophobicity and hydrophillicty. """ correlation_factor_hydrophobicity = [] correlation_factor_hydrophilicity = [] #iterate over sequence, calculating correlation factors using both properties for i in range(len(sequence) - k): aa1 = sequence[i] aa2 = sequence[i + k] corelation_function = amphiphilic_correllation_function(aa1, aa2) correlation_factor_hydrophobicity.append(corelation_function[0]) correlation_factor_hydrophilicity.append(corelation_function[1]) #append hydrophobicity and hydrophilicity values to array correlation_factor = [] correlation_factor.append(round(sum(correlation_factor_hydrophobicity) / (len(sequence) - k), 3)) correlation_factor.append(round(sum(correlation_factor_hydrophilicity) / (len(sequence) - k), 3)) return correlation_factor
[docs] def amphiphilic_correllation_function(Ri: str = "S", Rj: str = "D") -> tuple[float, float]: """ Calculate correlation value based on hydrophobicity and hydrophillicity property values for input amino acids Ri and Rj in sequence. Parameters ========== :Ri: str 1st amino acid. :Rj: str 2nd amino acid. Returns ======= :theta1, theta2: float correlation values for input property values. """ #normalize properties _hydrophobicity = normalize_property(hydrophobicity_) _hydrophilicity = normalize_property(hydrophilicity_) #calculate correlation using hydrophobicity & hydrophilicty properties theta1 = round(_hydrophobicity[Ri] * _hydrophobicity[Rj], 3) theta2 = round(_hydrophilicity[Ri] * _hydrophilicity[Rj], 3) return theta1, theta2