-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathesm.py
100 lines (86 loc) · 3.29 KB
/
esm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import streamlit as st
import requests
import urllib3
import py3Dmol
# get_pdb() posts with verify=False, which would make urllib3 emit an
# InsecureRequestWarning on every request; silence that specific warning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# ESM Atlas endpoint that get_pdb() POSTs the raw sequence to; the response
# body is used by the rest of this script as PDB-format text.
url = "https://api.esmatlas.com/foldSequence/v1/pdb/"
def get_pdb(sequence, timeout=300):
    """Request a predicted structure for ``sequence`` from the folding API.

    Args:
        sequence: Amino-acid sequence, sent as the raw POST body to ``url``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the app indefinitely.

    Returns:
        The response body as text (used by the caller as PDB data).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            an error page is never mistaken for a structure.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.post(
        url,
        data=sequence,
        # Certificate verification is disabled here, matching the
        # module-level suppression of InsecureRequestWarning.
        # NOTE(review): confirm this is still required for the endpoint.
        verify=False,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.text
def read_fasta(file):
    """Read the residue letters from a FASTA file opened in binary mode.

    Header lines (those starting with ``>``) and blank lines are skipped.
    The remaining lines are joined in order, so a file holding several
    records yields one concatenated sequence.

    Args:
        file: Iterable of ``bytes`` lines, e.g. an uploaded file object.

    Returns:
        The sequence as a single ``str``; empty if there are no sequence lines.

    Raises:
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    chunks = []
    for raw_line in file:
        # "utf-8-sig" drops a leading byte-order mark; with plain "utf-8" the
        # BOM survives strip() and keeps the first header from matching ">".
        text = raw_line.decode("utf-8-sig").strip()
        if not text or text.startswith(">"):
            continue
        chunks.append(text)
    return "".join(chunks)
def visualize_pdb(pdb_data):
    """Build a py3Dmol view of ``pdb_data``.

    The model is loaded as PDB text into an 800x800 view, given a cartoon
    style with 'spectrum' colouring, set spinning, and zoomed to fit.

    Args:
        pdb_data: Structure in PDB text form.

    Returns:
        The configured ``py3Dmol.view`` object.
    """
    canvas_px = 800
    cartoon_style = {'cartoon': {'color': 'spectrum'}}

    viewer = py3Dmol.view(width=canvas_px, height=canvas_px)
    viewer.addModel(pdb_data, 'pdb')
    viewer.setStyle(cartoon_style)
    # Keep the spin-then-zoom order of the viewer commands.
    viewer.spin()
    viewer.zoomTo()
    return viewer
def extract_plddt(pdb_data):
    """Return one float per ATOM record, read from fixed columns 61-66.

    Those columns are the B-factor field of a PDB ATOM record; the caller
    treats the values as pLDDT scores. Lines that do not start with
    ``ATOM`` are ignored.

    Args:
        pdb_data: Structure in PDB text form.

    Returns:
        List of floats in file order; empty if there are no ATOM lines.

    Raises:
        ValueError: If an ATOM line has no parseable number in that field.
    """
    atom_records = (
        record for record in pdb_data.splitlines() if record.startswith("ATOM")
    )
    # Slice [60:66] is the 0-based form of columns 61-66.
    return [float(record[60:66].strip()) for record in atom_records]
# Page chrome first: set_page_config precedes every other Streamlit call here.
st.set_page_config(page_title='ESMFold Protein Structure Predictor', page_icon='🎈')

# Markdown shown in the sidebar, kept apart from the layout calls below.
about_markdown = (
    '[*ESMFold*](https://esmatlas.com/about) is an end-to-end single sequence '
    'protein structure predictor based on the ESM-2 language model. '
    'For more information, read the '
    '[research article](https://www.biorxiv.org/content/10.1101/2022.07.20.500902v2) '
    'and the [news article](https://www.nature.com/articles/d41586-022-03539-1) '
    'published in *Nature*.'
)
sequence_format_markdown = """
The protein sequence should be provided as a string of amino acid residues using the standard one-letter amino acid codes.
- A = Alanine
- C = Cysteine
- D = Aspartic acid
- E = Glutamic acid
- F = Phenylalanine
- G = Glycine
- H = Histidine
- I = Isoleucine
- K = Lysine
- L = Leucine
- M = Methionine
- N = Asparagine
- P = Proline
- Q = Glutamine
- R = Arginine
- S = Serine
- T = Threonine
- V = Valine
- W = Tryptophan
- Y = Tyrosine
"""

# Sidebar: introduction, input-format help, then the two input widgets.
sidebar = st.sidebar
sidebar.title('🎈 ESMFold')
sidebar.write(about_markdown)
sidebar.subheader('Protein Sequence Format')
sidebar.write(sequence_format_markdown)

# Inputs read by the prediction step: pasted text and/or an uploaded FASTA file.
input_sequence = sidebar.text_area("Protein Sequence", height=200)
uploaded_file = sidebar.file_uploader("Upload FASTA file", type=["fasta", "fa"])
# Run a prediction only on the rerun triggered by the sidebar button.
if st.sidebar.button("Predict Structure"):
    if uploaded_file:
        # An uploaded FASTA file takes precedence over pasted text.
        input_sequence = read_fasta(uploaded_file)
    else:
        # Pasted sequences often contain spaces or line breaks; drop all
        # whitespace so only the residue letters are submitted.
        input_sequence = "".join(input_sequence.split())

    if not input_sequence:
        st.sidebar.error("Please enter a protein sequence or upload a FASTA file.")
    else:
        st.sidebar.write("Getting your structure...")
        try:
            pdb_data = get_pdb(input_sequence)
        except requests.RequestException as exc:
            # Show network/HTTP failures as a message, not a raw traceback.
            st.error(f"The structure prediction request failed: {exc}")
        else:
            plddt_scores = extract_plddt(pdb_data)
            if not plddt_scores:
                # No ATOM records means the response is not a usable
                # structure: offering it for download or averaging an empty
                # score list (division by zero) would be wrong.
                st.error(
                    "The prediction service did not return any atom records, "
                    "so there is no structure to display. "
                    "Please check the sequence and try again."
                )
            else:
                st.sidebar.download_button(
                    label="Download PDB File",
                    data=pdb_data,
                    file_name="predicted_structure.pdb",
                    mime="chemical/x-pdb"
                )
                st.subheader("3D Visualization of the Predicted Protein Structure")
                view = visualize_pdb(pdb_data)
                # py3Dmol's HTML is embedded directly as a Streamlit component.
                view_html = view._make_html()
                st.components.v1.html(view_html, width=800, height=800)
                st.subheader("pLDDT Scores")
                st.write("The predicted Local Distance Difference Test (pLDDT) score is a confidence measure for the predicted structure, ranging from 0 to 1. Higher scores indicate greater confidence in the accuracy of the predicted structure.")
                st.info(f"Average pLDDT score: {sum(plddt_scores) / len(plddt_scores):.2f}")