################################################################################
############### Quasi Sequence Order ###############
################################################################################
from __future__ import annotations
import os
import json
from difflib import get_close_matches
import pandas as pd
import math
from json import JSONDecodeError
import sys
from . import composition
"""
References
==========
[1] Kuo-Chen Chou. Prediction of Protein Subcellar Locations by Incorporating
Quasi-Sequence-Order Effect. Biochemical and Biophysical Research Communications,
2000, 278, 477-483.
[2] Kuo-Chen Chou and Yu-Dong Cai. Prediction of Protein Subcellular Locations by
GO-FunD-PseAA Predictor. Biochemical and Biophysical Research Communications,
2004, 320, 1236-1239.
[3] Gisbert Schneider and Paul Wrede. The Rational Design of Amino Acid Sequences
by Artifical Neural Networks and Simulated Molecular Evolution: Do Novo Design
of an Idealized Leader Cleavge Site. Biophys Journal, 1994, 66, 335-344.
[4] Grantham, R. (1974-09-06). "Amino acid difference formula to help explain protein
evolution". Science. 185 (4154): 862–864. Bibcode:1974Sci...185..862G.
doi:10.1126/science.185.4154.862. ISSN 0036-8075. PMID 4843792. S2CID 35388307.
"""
#list of amino acids
from ._constants import amino_acids, _validate_sequence
############################# Sequence Order Coupling Number ##############################
[docs]
def sequence_order_coupling_number_(sequence: str, d: int = 1, distance_matrix: str = "schneider-wrede") -> float:
"""
Calculate Sequence Order Coupling Number (SOCN) features for input protein sequence.
SOCN computes the dissimilarity between amino acid pairs. The distance between
amino acid pairs is determined by d which varies between 1 to lag. For each d, it
computes the sum of the dissimilarities of all amino acid pairs. The output will be
a single float value representing the SOCN [1].
This function should not be confused with sequence_order_coupling_number() below
which calculates the multiple SOCN descriptor values for the input sequence
according to the value of lag; with the default of lag=30, 30 SOCN's will be
calculated.
Parameters
==========
:sequence: str
protein sequence.
:d: int (default=1)
gap between two amino acids.
:distance_matrix: str (default="schneider-wrede")
physicochemical distance matrix to use; accepts "schneider-wrede" or "grantham".
Returns
=======
:socn: float
calculated sequence order coupling number value from sequence.
"""
#validate input protein sequence
sequence = _validate_sequence(sequence)
#validate input distance matrix is valid, get its closest match using closeness function
dist_matrix_matches = get_close_matches(distance_matrix, ["schneider-wrede", "grantham"], cutoff=0.6)
if (dist_matrix_matches != []):
distance_matrix = dist_matrix_matches[0] #set desc to closest descriptor match found
else:
raise ValueError(f"Could not find a match for the input distance matrix {distance_matrix} in"
f" list of available valid distance matrices:\n{['schneider-wrede', 'grantham']}.")
#get full path to selected distance matric
distance_matrix = os.path.join(os.path.dirname(os.path.abspath(sys.modules['protpy'].__file__)), "data", distance_matrix + "-physicochemical-distance-matrix.json")
#open distance matrix json if present
try:
with open(distance_matrix, "r") as f:
distance_matrix = json.load(f)
except (OSError, json.JSONDecodeError):
raise JSONDecodeError(f'Error parsing distance matrix JSON file: {distance_matrix}.')
tau = 0.0
#iterate over length of sequence minus gap, incrementing the SOCN at each iteration using
#the values from the distance matrix with the current and next amino acid, round to 2 dp
for aa in range(len(sequence) - d):
current_aa = sequence[aa]
next_aa = sequence[aa + d]
tau = tau + math.pow(distance_matrix[current_aa + next_aa], 2)
return round(tau, 3)
[docs]
def sequence_order_coupling_number(sequence: str, lag: int = 30,
distance_matrix: str = "schneider-wrede") -> pd.DataFrame:
"""
Calculate Sequence Order Coupling Number (SOCN) features for input protein sequence.
SOCN computes the dissimilarity between amino acid pairs. The distance between
amino acid pairs is determined by d which varies between 1 to lag. For each d, it
computes the sum of the dissimilarities of all amino acid pairs. The number of
output features can be calculated as N, where N = lag, by default this value
is 30 which generates an output of 1 x 30.
Parameters
==========
:sequence: str
protein sequence.
:lag: int (default=30)
maximum gap between 2 amino acids; the protein length should be larger than lag.
:distance_matrix: str (default="schneider-wrede")
physicochemical distance matrix to use; accepts "schneider-wrede" or "grantham".
Returns
=======
:sequence_order_df: pd.Dataframe
Dataframe of SOCN descriptor values for all protein sequences. Output
will be of the shape 1 x N, where N is the number of features calculated
from the descriptor and N=lag.
"""
#validate input protein sequence
sequence = _validate_sequence(sequence)
#raise value error if int cant be parsed from input lag
try:
lag = int(lag)
except (ValueError, TypeError):
raise ValueError(f"Invalid lag value input: {lag}.")
#set dataframe column name prefixes based on input distance matrix, raise error if invalid name input
col_prefix = ""
if (distance_matrix == "schneider-wrede"):
col_prefix = "SOCN_SW"
elif (distance_matrix == "grantham"):
col_prefix = "SOCN_Grant"
else:
raise ValueError(f"Invalid distance matrix name input {distance_matrix}, values can be schneider-wrede or grantham.")
#validate lag, set default lag of 30 if invalid value input
if (lag>=len(sequence) or (lag<0)):
lag = 30
seq_order = {}
#iterate over sequence with lag, calculating SOCN using input distance matrix values
for i in range(lag):
tau = sequence_order_coupling_number_(sequence, i+1, distance_matrix)
#append SOCN column and value to dict
seq_order[col_prefix + str(i + 1)] = round(tau, 3)
#transform SOCN data into pandas dataframe
sequence_order_df = pd.DataFrame([list(seq_order.values())], columns=list(seq_order.keys()))
return sequence_order_df
[docs]
def sequence_order_coupling_number_all(sequence: str, lag: int = 30) -> pd.DataFrame:
"""
Calculate Sequence Order Coupling Number (SOCN) descriptor values of input protein
sequence using both matrices (schneider-wrede & grantham) [3]. The distance between
amino acid pairs is determined by d which varies between 1 to lag. For each d, it
computes the sum of the dissimilarities of all amino acid pairs. Each matrix generates
an output of 1 x N, where N is the lag, so using the two concatenated matrices, the
output will be 1 x (N * 2). With a default lag of 30, this will generate an output
of 1 x 60.
Parameters
==========
:sequence: str
protein sequence.
:lag: int (default=30)
maximum gap between 2 amino acids; the protein length should be larger than lag.
Returns
=======
:socn_all: pd.Dataframe
Concatenated dataframe of SOCN descriptor values of input protein sequence using
both distance matrices. The number of output features can be calculated as N * 2,
where N = lag, by default this value is 30 which generates an output of 1 x 30
for each matrix, with the two matrices the output will be 1 x (30*2), using the
default lag.
"""
#validate input protein sequence
sequence = _validate_sequence(sequence)
#raise value error if int cant be parsed from input lag
try:
lag = int(lag)
except (ValueError, TypeError):
raise ValueError(f"Invalid lag value input: {lag}.")
#validate lag, set default lag of 30 if invalid value input
if (lag>=len(sequence) or (lag<0)):
lag = 30
#calculate SOCN using schneider-wrede distance matrix
socn_schneider = sequence_order_coupling_number(sequence, lag, "schneider-wrede")
#calculate SOCN using grantham distance matrix
socn_grantham = sequence_order_coupling_number(sequence, lag, "grantham")
#merge 2 dataframes
socn_all = pd.concat([socn_schneider, socn_grantham], axis=1)
return socn_all
################################## Quasi Sequence Order ###################################
[docs]
def quasi_sequence_order(sequence: str, lag: int = 30, weight: float = 0.1,
distance_matrix: str = "schneider-wrede") -> pd.DataFrame:
"""
Calculate Quasi Sequence Order (QSO) features for the protein sequences.
The quasi-sequence-order descriptors were proposed by K.C. Chou, et.al. [1].
They are derived from the distance matrix between the 20 amino acids. By default,
the Scheider-Wrede physicochemical distance matrix was used. Also utilised in
the descriptor calculation is the Grantham chemical distance matrix. Both of
these matrices are used by Grantham et. al. in the calculation of the descriptor
[3, 4]. N + 20 values are calculated per sequence, where N is the lag, with a
default lag of 30, the output will be 1 x 50. There is also a weighting factor
that can be assigned to determine that weight per amino acid.
Parameters
==========
:sequence: str
protein sequence.
:lag: int (default=30)
maximum gap between 2 amino acids; the protein length should be larger than lag.
:weight: float (default = 0.1)
weighting factor.
:distance_matrix: str (default="schneider-wrede")
physicochemical distance matrix to use; accepts "schneider-wrede" or "grantham".
Returns
=======
:quasi_sequence_order_df: pd.Dataframe
dataframe of quasi-sequence-order descriptor values for the protein sequences,
with output shape 1 x (N + 20), where N is the lag. With a default
lag of 30 the output will be 1 x 50 per sequence.
"""
#validate input protein sequence
sequence = _validate_sequence(sequence)
#raise value error if int can't be parsed from input lag
try:
lag = int(lag)
except (ValueError, TypeError):
raise ValueError(f"Invalid lag value input: {lag}.")
#validate lag, set default lag of 30 if invalid value input
if (lag>=len(sequence) or (lag<0)):
lag = 30
#validate weight, set default weight of 0.1 if invalid value input
if ((weight < 0) or not (isinstance(weight, (float, int)))):
weight = 0.1
#set dataframe column name prefixes based on input distance matrix, raise error if invalid name input
col_prefix = ""
if (distance_matrix == "schneider-wrede"):
col_prefix = "QSO_SW"
elif (distance_matrix == "grantham"):
col_prefix = "QSO_Grant"
else:
raise ValueError(f"Invalid distance matrix name input {distance_matrix}, values can be schneider-wrede or grantham.")
quasi_sequence_order = {}
right_part = 0.0
#calculate first 20 quasi sequence order using sequence order coupling number for
#proteins using lag and specificed physicochemical distance matrix
for aa in range(lag):
right_part = right_part + sequence_order_coupling_number_(sequence, aa+1, distance_matrix)
#get amino acid composition
aa_comp = composition.amino_acid_composition(sequence)
#iterate over amino acids, calculating descriptor value
temp = 1 + weight * right_part
for index, aa in enumerate(amino_acids):
quasi_sequence_order[col_prefix + str(index + 1)] = round(aa_comp[aa].values[0] / temp, 6)
#calculate SOCN up until maxlag
_right_part = []
for i in range(lag):
_right_part.append(
sequence_order_coupling_number_(sequence, i+1, distance_matrix))
#calculate the last maxlag descriptor values
temp = 1 + weight * sum(_right_part)
for index in range(20, 20 + lag):
quasi_sequence_order[col_prefix + str(index + 1)] = round(
weight * _right_part[index - 20] / temp, 6)
#transform descriptor data into pandas dataframe
quasi_sequence_order_df = pd.DataFrame([list(quasi_sequence_order.values())],
columns=list(quasi_sequence_order.keys()))
return quasi_sequence_order_df
[docs]
def quasi_sequence_order_all(sequence: str, lag: int = 30, weight: float = 0.1) -> pd.DataFrame:
"""
Calculate Quasi Sequence Order features for input protein sequence using
both physicochemical distance matrices. Concatenate into one output dataframe.
The output will be in the shape 1 x ((N + 20)*2), where ((N + 20)*2) is the
quasi sequence order output from one matrix and N is the lag. The output
is multiplied by two to take into account the 2 matrices being concatenated.
There is also a weighting factor that can be assigned to determine that
weight per amino acid.
Parameters
==========
:sequence: str
protein sequence.
:lag: int (default=30)
maximum gap between 2 amino acids; the protein length should be larger than lag.
:weight: float (default=0.1)
weighting factor.
Returns
=======
:quasi_sequence_order_all_df: pd.Dataframe
dataframe of quasi-sequence-order descriptor values for the
protein sequences, with output shape 1 x ((N + 20)*2) where ((N + 20)*2) is the
quasi sequence order output from one matrix and N is the lag. The output
is multiplied by two to take into account the 2 matrices being concatenated.
"""
#validate input protein sequence
sequence = _validate_sequence(sequence)
#raise value error if int can't be parsed from input lag
try:
lag = int(lag)
except (ValueError, TypeError):
raise ValueError(f"Invalid lag value input: {lag}.")
#validate lag, set default lag of 30 if invalid value input
if (lag>=len(sequence) or (lag<0)):
lag = 30
#validate weight, set default weight if invalid value input
if ((weight < 0) or not (isinstance(weight, (float, int)))):
weight = 0.1
#calculate quasi seq order using schneider distance matrix
quasi_schneider = quasi_sequence_order(sequence, lag, weight=weight,
distance_matrix="schneider-wrede")
#calculate quasi seq order using schneider distance matrix
quasi_grantham = quasi_sequence_order(sequence, lag, weight=weight,
distance_matrix="grantham")
#concat quasi sequence order dataframes
quasi_sequence_order_all_df = pd.concat([quasi_schneider, quasi_grantham], axis=1)
return quasi_sequence_order_all_df