Usage
Installation
Install the latest version of protpy via pip:
pip3 install protpy --upgrade
Alternatively, clone the repository and install from source:
git clone https://github.com/amckenna41/protpy.git
cd protpy
pip install -e .
Importing protpy
import protpy as protpy
Loading a protein sequence from a FASTA file
from Bio import SeqIO
with open("protein.fasta") as f:
    protein_seq = str(next(SeqIO.parse(f, "fasta")).seq)
Composition Descriptors
Calculate Amino Acid Composition (AAComp):
amino_acid_comp = protpy.amino_acid_composition(protein_seq)
# A C D E F ...
# 6.693 3.108 5.817 3.347 6.614 ...
Calculate Dipeptide Composition (DPComp):
dipeptide_comp = protpy.dipeptide_composition(protein_seq)
# AA AC AD AE AF ...
# 0.72 0.16 0.48 0.4 0.24 ...
Calculate Tripeptide Composition (TPComp):
tripeptide_comp = protpy.tripeptide_composition(protein_seq)
# AAA AAC AAD AAE AAF ...
# 1 0 0 2 0 ...
Calculate Pseudo Amino Acid Composition (PAAComp):
#using default parameters: lamda=30, weight=0.05, properties=[]
pseudo_comp = protpy.pseudo_amino_acid_composition(protein_seq)
# PAAC_1 PAAC_2 PAAC_3 PAAC_4 PAAC_5 ...
# 0.127 0.059 0.111 0.064 0.126 ...
Calculate Amphiphilic Pseudo Amino Acid Composition (APAAComp):
#using default parameters: lamda=30, weight=0.5
amphiphilic_comp = protpy.amphiphilic_pseudo_amino_acid_composition(protein_seq)
# APAAC_1 APAAC_2 APAAC_3 APAAC_4 APAAC_5 ...
# 6.624 3.076 5.757 3.312 6.546 ...
Calculate Shannon Entropy:
se = protpy.shannon_entropy(protein_seq)
# ShannonEntropy
# 4.163
Calculate GRAVY (Grand Average of Hydropathy):
gravy = protpy.gravy(protein_seq)
# GRAVY
# -0.045
#positive value = overall hydrophobic; negative value = overall hydrophilic
Calculate Aromaticity:
aromaticity = protpy.aromaticity(protein_seq)
# Aromaticity
# 0.118
Calculate Instability Index:
instability = protpy.instability_index(protein_seq)
# InstabilityIndex
# 31.836
#score < 40 = stable protein; score >= 40 = unstable
Calculate Isoelectric Point:
pi = protpy.isoelectric_point(protein_seq)
# IsoelectricPoint
# 5.412
Calculate Molecular Weight:
mw = protpy.molecular_weight(protein_seq)
# MolecularWeight
# 139122.355
Calculate Charge Distribution:
#using default pH: ph=7.4
charge = protpy.charge_distribution(protein_seq)
# PositiveCharge NegativeCharge NetCharge
# 99.526 114.956 -15.43
#using custom pH
charge = protpy.charge_distribution(protein_seq, ph=6.0)
Calculate Hydrophobic/Polar/Charged Composition (HPC):
hpc = protpy.hydrophobic_polar_charged_composition(protein_seq)
# Hydrophobic Polar Charged
# 44.542 32.669 18.247
Calculate Secondary Structure Propensity (SSP):
ssp = protpy.secondary_structure_propensity(protein_seq)
# Helix Sheet Coil
# 0.983 1.05 1.043
Calculate k-mer Composition:
#using default k=2 (dipeptide frequencies by Chou-Fasman class)
kmer = protpy.kmer_composition(protein_seq)
# AA AC AD ...
# 0.797 0.159 ... ...
#using custom k
kmer = protpy.kmer_composition(protein_seq, k=3)
Calculate Reduced Alphabet Composition:
#using default alphabet_size=6
reduced = protpy.reduced_alphabet_composition(protein_seq)
# Group_1 Group_2 Group_3 Group_4 Group_5 Group_6
# 25.339 34.741 9.163 9.084 10.837 10.837
#supported sizes: 2, 3, 4, 6
reduced = protpy.reduced_alphabet_composition(protein_seq, alphabet_size=4)
Calculate Motif Composition:
#using default built-in motifs (N-glycosylation, RGD, KDEL, CxxC, CAAX, PKA, dileucine, PEST)
motif = protpy.motif_composition(protein_seq)
# NxST_glycosylation RGD_integrin KDEL_retention CxxC_zinc_finger ...
# 23 0 0 2 ...
#using custom motif list
motif = protpy.motif_composition(protein_seq, motifs=[r'RGD', r'NxS'])
Calculate Amino Acid Pair Composition:
pair = protpy.amino_acid_pair_composition(protein_seq)
# 400-column DataFrame with class-annotated column names
# AA_Hydrophobic-Hydrophobic AA_Hydrophobic-Polar ...
# 0.797 0.159 ...
Calculate Aliphatic Index:
ai = protpy.aliphatic_index(protein_seq)
# AliphaticIndex
# 82.725
Calculate Extinction Coefficient:
ec = protpy.extinction_coefficient(protein_seq)
# ExtCoeff_Reduced ExtCoeff_Oxidized
# 140960 143335
Calculate Boman Index:
bi = protpy.boman_index(protein_seq)
# BomanIndex
# 0.119
Calculate Aggregation Propensity:
ap = protpy.aggregation_propensity(protein_seq)
# AggregProneRegions AggregProneFraction
# 58 11.793
Calculate Hydrophobic Moment:
#using default parameters: window=11, angle=100 (alpha-helix)
hm = protpy.hydrophobic_moment(protein_seq)
# HydrophobicMoment_Mean HydrophobicMoment_Max
# 0.272 0.813
Autocorrelation Descriptors
Calculate Moreau-Broto Autocorrelation (MBAuto):
#using default parameters: lag=30, properties=[...], normalize=True
moreaubroto_auto = protpy.moreaubroto_autocorrelation(protein_seq)
# MBAuto_CIDH920105_1 MBAuto_CIDH920105_2 MBAuto_CIDH920105_3 ...
# -0.052 -0.104 -0.156 ...
Calculate Moran Autocorrelation (MAuto):
moran_auto = protpy.moran_autocorrelation(protein_seq)
# MAuto_CIDH920105_1 MAuto_CIDH920105_2 MAuto_CIDH920105_3 ...
# -0.07786 -0.07879 -0.07906 ...
Calculate Geary Autocorrelation (GAuto):
geary_auto = protpy.geary_autocorrelation(protein_seq)
# GAuto_CIDH920105_1 GAuto_CIDH920105_2 GAuto_CIDH920105_3 ...
# 1.057 1.077 1.04 ...
Conjoint Triad Descriptor
Calculate Conjoint Triad (CTriad):
conjoint_triad = protpy.conjoint_triad(protein_seq)
# 111 112 113 114 115 ...
# 7 17 11 3 6 ...
CTD Descriptors
Calculate CTD — Composition, Transition, Distribution:
#using default parameters: property="hydrophobicity", all_ctd=True
ctd = protpy.ctd(protein_seq)
# hydrophobicity_CTD_C_01 hydrophobicity_CTD_C_02 hydrophobicity_CTD_C_03 ...
# 0.279 0.386 0.335 ...
Sequence Order Descriptors
Calculate a single Sequence Order Coupling Number (SOCN):
#using default parameters: d=1, distance_matrix="schneider-wrede"
socn = protpy.sequence_order_coupling_number_(protein_seq)
# 401.387
Calculate all SOCNs across a lag:
#using default parameters: lag=30, distance_matrix="schneider-wrede"
socn_all = protpy.sequence_order_coupling_number(protein_seq)
# SOCN_SW1 SOCN_SW2 SOCN_SW3 SOCN_SW4 SOCN_SW5 ...
# 401.387 409.243 376.946 393.042 396.196 ...
#using custom parameters: lag=10, distance_matrix="grantham"
socn_all = protpy.sequence_order_coupling_number(protein_seq, lag=10, distance_matrix="grantham")
# SOCN_Grant1 SOCN_Grant2 SOCN_Grant3 ...
# 399.125 402.153 387.820 ...
Calculate Quasi Sequence Order (QSO):
#using default parameters: lag=30, weight=0.1, distance_matrix="schneider-wrede"
qso = protpy.quasi_sequence_order(protein_seq)
# QSO_SW1 QSO_SW2 QSO_SW3 QSO_SW4 QSO_SW5 ...
# 0.005692 0.002643 0.004947 0.002846 0.005625 ...
#using custom parameters: lag=10, weight=0.2, distance_matrix="grantham"
qso = protpy.quasi_sequence_order(protein_seq, lag=10, weight=0.2, distance_matrix="grantham")
# QSO_Grant1 QSO_Grant2 QSO_Grant3 ...
# 0.123287 0.079967 0.04332 ...
Note
A demo of the software is available here.