Source code for protpy.sequence_order

################################################################################
###############              Quasi Sequence Order                ###############
################################################################################

from __future__ import annotations

import os
import json
from difflib import get_close_matches
import pandas as pd
import math
from json import JSONDecodeError
import sys
from . import composition

"""
References
==========
[1] Kuo-Chen Chou. Prediction of Protein Subcellar Locations by Incorporating
    Quasi-Sequence-Order Effect. Biochemical and Biophysical Research Communications,
    2000, 278, 477-483.
[2] Kuo-Chen Chou and Yu-Dong Cai. Prediction of Protein Subcellular Locations by
    GO-FunD-PseAA Predictor. Biochemical and Biophysical Research Communications,
    2004, 320, 1236-1239.
[3] Gisbert Schneider and Paul Wrede. The Rational Design of Amino Acid Sequences
    by Artifical Neural Networks and Simulated Molecular Evolution: Do Novo Design
    of an Idealized Leader Cleavge Site. Biophys Journal, 1994, 66, 335-344.
[4] Grantham, R. (1974-09-06). "Amino acid difference formula to help explain protein
    evolution". Science. 185 (4154): 862–864. Bibcode:1974Sci...185..862G.
    doi:10.1126/science.185.4154.862. ISSN 0036-8075. PMID 4843792. S2CID 35388307.
"""

#list of amino acids
from ._constants import amino_acids, _validate_sequence

############################# Sequence Order Coupling Number ##############################

[docs] def sequence_order_coupling_number_(sequence: str, d: int = 1, distance_matrix: str = "schneider-wrede") -> float: """ Calculate Sequence Order Coupling Number (SOCN) features for input protein sequence. SOCN computes the dissimilarity between amino acid pairs. The distance between amino acid pairs is determined by d which varies between 1 to lag. For each d, it computes the sum of the dissimilarities of all amino acid pairs. The output will be a single float value representing the SOCN [1]. This function should not be confused with sequence_order_coupling_number() below which calculates the multiple SOCN descriptor values for the input sequence according to the value of lag; with the default of lag=30, 30 SOCN's will be calculated. Parameters ========== :sequence: str protein sequence. :d: int (default=1) gap between two amino acids. :distance_matrix: str (default="schneider-wrede") physicochemical distance matrix to use; accepts "schneider-wrede" or "grantham". Returns ======= :socn: float calculated sequence order coupling number value from sequence. """ #validate input protein sequence sequence = _validate_sequence(sequence) #validate input distance matrix is valid, get its closest match using closeness function dist_matrix_matches = get_close_matches(distance_matrix, ["schneider-wrede", "grantham"], cutoff=0.6) if (dist_matrix_matches != []): distance_matrix = dist_matrix_matches[0] #set desc to closest descriptor match found else: raise ValueError(f"Could not find a match for the input distance matrix {distance_matrix} in" f" list of available valid distance matrices:\n{['schneider-wrede', 'grantham']}.") #get full path to selected distance matric distance_matrix = os.path.join(os.path.dirname(os.path.abspath(sys.modules['protpy'].__file__)), "data", distance_matrix + "-physicochemical-distance-matrix.json") #open distance matrix json if present try: with open(distance_matrix, "r") as f: distance_matrix = json.load(f) except (OSError, json.JSONDecodeError): raise JSONDecodeError(f'Error parsing distance matrix JSON file: {distance_matrix}.') tau = 0.0 #iterate over length of sequence minus gap, incrementing the SOCN at each iteration using #the values from the distance matrix with the current and next amino acid, round to 2 dp for aa in range(len(sequence) - d): current_aa = sequence[aa] next_aa = sequence[aa + d] tau = tau + math.pow(distance_matrix[current_aa + next_aa], 2) return round(tau, 3)
[docs] def sequence_order_coupling_number(sequence: str, lag: int = 30, distance_matrix: str = "schneider-wrede") -> pd.DataFrame: """ Calculate Sequence Order Coupling Number (SOCN) features for input protein sequence. SOCN computes the dissimilarity between amino acid pairs. The distance between amino acid pairs is determined by d which varies between 1 to lag. For each d, it computes the sum of the dissimilarities of all amino acid pairs. The number of output features can be calculated as N, where N = lag, by default this value is 30 which generates an output of 1 x 30. Parameters ========== :sequence: str protein sequence. :lag: int (default=30) maximum gap between 2 amino acids; the protein length should be larger than lag. :distance_matrix: str (default="schneider-wrede") physicochemical distance matrix to use; accepts "schneider-wrede" or "grantham". Returns ======= :sequence_order_df: pd.Dataframe Dataframe of SOCN descriptor values for all protein sequences. Output will be of the shape 1 x N, where N is the number of features calculated from the descriptor and N=lag. """ #validate input protein sequence sequence = _validate_sequence(sequence) #raise value error if int cant be parsed from input lag try: lag = int(lag) except (ValueError, TypeError): raise ValueError(f"Invalid lag value input: {lag}.") #set dataframe column name prefixes based on input distance matrix, raise error if invalid name input col_prefix = "" if (distance_matrix == "schneider-wrede"): col_prefix = "SOCN_SW" elif (distance_matrix == "grantham"): col_prefix = "SOCN_Grant" else: raise ValueError(f"Invalid distance matrix name input {distance_matrix}, values can be schneider-wrede or grantham.") #validate lag, set default lag of 30 if invalid value input if (lag>=len(sequence) or (lag<0)): lag = 30 seq_order = {} #iterate over sequence with lag, calculating SOCN using input distance matrix values for i in range(lag): tau = sequence_order_coupling_number_(sequence, i+1, distance_matrix) #append SOCN column and value to dict seq_order[col_prefix + str(i + 1)] = round(tau, 3) #transform SOCN data into pandas dataframe sequence_order_df = pd.DataFrame([list(seq_order.values())], columns=list(seq_order.keys())) return sequence_order_df
[docs] def sequence_order_coupling_number_all(sequence: str, lag: int = 30) -> pd.DataFrame: """ Calculate Sequence Order Coupling Number (SOCN) descriptor values of input protein sequence using both matrices (schneider-wrede & grantham) [3]. The distance between amino acid pairs is determined by d which varies between 1 to lag. For each d, it computes the sum of the dissimilarities of all amino acid pairs. Each matrix generates an output of 1 x N, where N is the lag, so using the two concatenated matrices, the output will be 1 x (N * 2). With a default lag of 30, this will generate an output of 1 x 60. Parameters ========== :sequence: str protein sequence. :lag: int (default=30) maximum gap between 2 amino acids; the protein length should be larger than lag. Returns ======= :socn_all: pd.Dataframe Concatenated dataframe of SOCN descriptor values of input protein sequence using both distance matrices. The number of output features can be calculated as N * 2, where N = lag, by default this value is 30 which generates an output of 1 x 30 for each matrix, with the two matrices the output will be 1 x (30*2), using the default lag. """ #validate input protein sequence sequence = _validate_sequence(sequence) #raise value error if int cant be parsed from input lag try: lag = int(lag) except (ValueError, TypeError): raise ValueError(f"Invalid lag value input: {lag}.") #validate lag, set default lag of 30 if invalid value input if (lag>=len(sequence) or (lag<0)): lag = 30 #calculate SOCN using schneider-wrede distance matrix socn_schneider = sequence_order_coupling_number(sequence, lag, "schneider-wrede") #calculate SOCN using grantham distance matrix socn_grantham = sequence_order_coupling_number(sequence, lag, "grantham") #merge 2 dataframes socn_all = pd.concat([socn_schneider, socn_grantham], axis=1) return socn_all
################################## Quasi Sequence Order ###################################
[docs] def quasi_sequence_order(sequence: str, lag: int = 30, weight: float = 0.1, distance_matrix: str = "schneider-wrede") -> pd.DataFrame: """ Calculate Quasi Sequence Order (QSO) features for the protein sequences. The quasi-sequence-order descriptors were proposed by K.C. Chou, et.al. [1]. They are derived from the distance matrix between the 20 amino acids. By default, the Scheider-Wrede physicochemical distance matrix was used. Also utilised in the descriptor calculation is the Grantham chemical distance matrix. Both of these matrices are used by Grantham et. al. in the calculation of the descriptor [3, 4]. N + 20 values are calculated per sequence, where N is the lag, with a default lag of 30, the output will be 1 x 50. There is also a weighting factor that can be assigned to determine that weight per amino acid. Parameters ========== :sequence: str protein sequence. :lag: int (default=30) maximum gap between 2 amino acids; the protein length should be larger than lag. :weight: float (default = 0.1) weighting factor. :distance_matrix: str (default="schneider-wrede") physicochemical distance matrix to use; accepts "schneider-wrede" or "grantham". Returns ======= :quasi_sequence_order_df: pd.Dataframe dataframe of quasi-sequence-order descriptor values for the protein sequences, with output shape 1 x (N + 20), where N is the lag. With a default lag of 30 the output will be 1 x 50 per sequence. """ #validate input protein sequence sequence = _validate_sequence(sequence) #raise value error if int can't be parsed from input lag try: lag = int(lag) except (ValueError, TypeError): raise ValueError(f"Invalid lag value input: {lag}.") #validate lag, set default lag of 30 if invalid value input if (lag>=len(sequence) or (lag<0)): lag = 30 #validate weight, set default weight of 0.1 if invalid value input if ((weight < 0) or not (isinstance(weight, (float, int)))): weight = 0.1 #set dataframe column name prefixes based on input distance matrix, raise error if invalid name input col_prefix = "" if (distance_matrix == "schneider-wrede"): col_prefix = "QSO_SW" elif (distance_matrix == "grantham"): col_prefix = "QSO_Grant" else: raise ValueError(f"Invalid distance matrix name input {distance_matrix}, values can be schneider-wrede or grantham.") quasi_sequence_order = {} right_part = 0.0 #calculate first 20 quasi sequence order using sequence order coupling number for #proteins using lag and specificed physicochemical distance matrix for aa in range(lag): right_part = right_part + sequence_order_coupling_number_(sequence, aa+1, distance_matrix) #get amino acid composition aa_comp = composition.amino_acid_composition(sequence) #iterate over amino acids, calculating descriptor value temp = 1 + weight * right_part for index, aa in enumerate(amino_acids): quasi_sequence_order[col_prefix + str(index + 1)] = round(aa_comp[aa].values[0] / temp, 6) #calculate SOCN up until maxlag _right_part = [] for i in range(lag): _right_part.append( sequence_order_coupling_number_(sequence, i+1, distance_matrix)) #calculate the last maxlag descriptor values temp = 1 + weight * sum(_right_part) for index in range(20, 20 + lag): quasi_sequence_order[col_prefix + str(index + 1)] = round( weight * _right_part[index - 20] / temp, 6) #transform descriptor data into pandas dataframe quasi_sequence_order_df = pd.DataFrame([list(quasi_sequence_order.values())], columns=list(quasi_sequence_order.keys())) return quasi_sequence_order_df
[docs] def quasi_sequence_order_all(sequence: str, lag: int = 30, weight: float = 0.1) -> pd.DataFrame: """ Calculate Quasi Sequence Order features for input protein sequence using both physicochemical distance matrices. Concatenate into one output dataframe. The output will be in the shape 1 x ((N + 20)*2), where ((N + 20)*2) is the quasi sequence order output from one matrix and N is the lag. The output is multiplied by two to take into account the 2 matrices being concatenated. There is also a weighting factor that can be assigned to determine that weight per amino acid. Parameters ========== :sequence: str protein sequence. :lag: int (default=30) maximum gap between 2 amino acids; the protein length should be larger than lag. :weight: float (default=0.1) weighting factor. Returns ======= :quasi_sequence_order_all_df: pd.Dataframe dataframe of quasi-sequence-order descriptor values for the protein sequences, with output shape 1 x ((N + 20)*2) where ((N + 20)*2) is the quasi sequence order output from one matrix and N is the lag. The output is multiplied by two to take into account the 2 matrices being concatenated. """ #validate input protein sequence sequence = _validate_sequence(sequence) #raise value error if int can't be parsed from input lag try: lag = int(lag) except (ValueError, TypeError): raise ValueError(f"Invalid lag value input: {lag}.") #validate lag, set default lag of 30 if invalid value input if (lag>=len(sequence) or (lag<0)): lag = 30 #validate weight, set default weight if invalid value input if ((weight < 0) or not (isinstance(weight, (float, int)))): weight = 0.1 #calculate quasi seq order using schneider distance matrix quasi_schneider = quasi_sequence_order(sequence, lag, weight=weight, distance_matrix="schneider-wrede") #calculate quasi seq order using schneider distance matrix quasi_grantham = quasi_sequence_order(sequence, lag, weight=weight, distance_matrix="grantham") #concat quasi sequence order dataframes quasi_sequence_order_all_df = pd.concat([quasi_schneider, quasi_grantham], axis=1) return quasi_sequence_order_all_df