command.txt

#A supporting file for 
#Structure-based virtual screening to identify inhibitors 
#against N-Terminal Acetyltransferase NatB using natural product compounds library

#author:yanhong hong

####################################################
#                                                  #
#      download compounds library from ZINC        #
#                                                  #
####################################################

#Because the download link for all natural-products is broken.
#(https://zinc15.docking.org/substances/subsets/natural-products.mol2?count=all)
#I tried to download them by downloading all biogenic compounds from different catalogs 
#and using the natural product list to filter it.

#command#1,2,3:download all biogenic compounds from different catalogs and the natural product list.

for i in {msnp,afronp,acdiscnp,biofacquimnp,acdiscnd,acdiscnpbb,asternp,biopurify,\
hmdbtoxin,indofinenp,molportnp,targetmolnp,nubbenp,himnp,hitnp,specsnp,npactnp,tcmnp\
,uefsnp,timtecnd};do (nohup curl -o $i.mol2 https://zinc15.docking.org/catalogs/$i/protomers.mol2?count=all &);done

for i in {msnp,afronp,acdiscnp,biofacquimnp,acdiscnd,acdiscnpbb,asternp,biopurify,\
hmdbtoxin,indofinenp,molportnp,targetmolnp,nubbenp,himnp,hitnp,specsnp,npactnp,tcmnp\
,uefsnp,timtecnd};do mkdir $i|mv $i.mol2 $i;done 

wget -O np.txt https://zinc15.docking.org/substances/subsets/natural-products.txt?count=all

cat np.txt|cut -f1|sort > sorted_cutted_np.txt

#command#4:use a script(split_rename_filter.sh) to split mol2 file,rename the splited files and filter them.
#The script can be found in the supplementary files.

for i in {msnp,afronp,acdiscnp,biofacquimnp,acdiscnd,acdiscnpbb,asternp,biopurify,\
hmdbtoxin,indofinenp,molportnp,targetmolnp,nubbenp,himnp,hitnp,specsnp,npactnp,tcmnp\
,uefsnp,timtecnd};do (bash split_rename_filter.sh $i &);done

#command#5:Integrate all the files
mkdir all|cp */mol2_final/* all &

####################################################
#                                                  #
#                    DOCKING                       #
#                                                  #
####################################################

#The original pdb file(natB.pdb)contains a ligand(a short peptide),a cofactor(coA),and a protein.
#use MOE to remove the ligand and the cofactor and save the receptor as receptor.pdb.


#Note that the autodock vina is not supported for parallel running,
#so we need to divide them into different groups,and run them parallelly. 
#command#7,8,9:
for f in all/*; do d=part_$(printf %03d $((i/10000+1))); mkdir -p $d;mv "$f" $d;let i++;done &
mkdir dock
cd dock

#WE USE A SCRIPT FOR PREPARE LIGANDS AND START DOCKING.

#The script is provided in the supplementary materials.(docking.sh)

#The config file(conf.txt) is generated using ADT.

#command#10:
for i in {1..19};do (bash docking.sh $i &);done


####################################################
#                                                  #
#                    RESCORING                     #
#                                                  #
####################################################

#1
#USE SMINA TO RESCORE THE POSES GENERATED BY AUTODOCK VINA.
#All poses are in the allresult directory.All files are generated using vina_split.
#Scoring functions include vinardo,dkoes_scoring,dkoes_fast,ad4_scoring

#command#10:set up the environment
mkdir -p rescorelog/vinardo rescorelog/dkoes_scoring rescorelog/dkoes_fast rescorelog/ad4_scoring

#command#11:
nohup sh -c 'for i in allresult/*;do smina --scoring dkoes_scoring --score_only -r receptor.pdbqt -l $i --log rescorelog/dkoes_scoring/`basename $i .pdbqt`.log;done' > nohup_dkoesscoring & 

#command#12:
nohup sh -c 'for i in allresult/*;do smina --scoring vinardo --score_only -r receptor.pdbqt -l $i --log rescorelog/vinardo/`basename $i .pdbqt`.log;done' > nohup_vinardo &

#command#13:
 nohup sh -c 'for i in allresult/*;do smina --scoring dkoes_fast --score_only -r receptor.pdbqt -l $i --log rescorelog/dkoes_fast/`basename $i .pdbqt`.log;done' > nohup_dkoesfast &

#command#14:
nohup sh -c 'for i in allresult/*;do smina --scoring ad4_scoring --score_only -r receptor.pdbqt -l $i --log rescorelog/ad4_scoring/`basename $i .pdbqt`.log;done' > nohup_ad4scoring &


#2
#USE NIB (negative image based) to rescore the poses.
#We use panther to generate the negative image and use shaep to compare the similarity between the negative image and poses.

#first,add hydrogens to the receptor.
#command#15:
./reduce -BUILD natb.pdb > natb_H.pdb

#Because our ligand is a peptide,We need to change the forth column to ZIN.
#...

#generate the default input parameter file default.in using mkdef option in panther.
#command#16:
./panther.py -mkdef default.in

#generate the model.
#command#17:
./panther.py -pfil natb_H.pdb -bmp F-2 default.in model.mol2 &

#NIB requires mol2 file format for ligands,We convert all pdbqt files to mol2 files using openbabel and integrate them into a single file all.mol2.

#START RESCORING
#command#18:
./shaep model.mol2 all.mol2 --output-file shaep_rescore_model.txt -s rescored_model.sdf --noOptimization &

#3
#sort all rescoring results and get intersection between all top0.1% results.

#e.g default
#command#19:
for i in default/*;do cat $i|egrep "Affinity:" |awk -v a=`basename $i .log` '{print a " " $2}';done > defaultlist &
#command#20:
sort -k2n defaultlist |head -n 1680|cut -d" " -f1|sort > sorted_defaulttop0.1%list

#NIB
cat shaep_rescore_model-I.txt|awk 'NR>1'|awk '{print $1"\t" $2}'|sort -k2n |tail -n 1680|cut -f1|sort > sorted_defaulttop0.1%list

#take the intersection between different groups

#command#20:
comm -1 -2 sorted_ad4scoringtop0.1% sorted_defaultlist0.1% |comm -1 -2 - sorted_dkoes_fastlist0.1% |comm -1 -2 - \ sorted_dkoes_scoringlist0.1% |comm -1 -2 - sorted_vinardolist0.1% |comm -1 -2 - sorted_vinatop0.1% |comm -1 -2 - \
sorted_NIB_rescoretop0.1% > commlist &


####################################################
#                                                  #
#                  VISUALIZATION                   #
#                                                  #
####################################################

#We visualize the chemical space using matplotlib,the commands are provided in supplementary materials(chemical_space_by_pca.ipynb)


####################################################
#                                                  #
#                MOLECULAR DYNAMICS                #
#                                                  #
####################################################

#We use GROMACS to do molecular dynamics,which is for validating the docking results.

#The original protein structure miss some residues,so we need to reconstruct the protein structure using swiss modeling.
#the resulting file is final_receptor.pdb.

#e.g ZINC000150350042

#1
#command#21:pdb2gmx
#force-field:ATB-specific 54a7 force field.
gmx_mpi pdb2gmx -f final_receptor.pdb -o receptor.gro -water spc -ignh


#2:We use ATB to prepare the cofactor/ligand's topology file and coodinate file.
#The resulting files are 1NFP.itp,1NFP.pdb,EM94.itp,EM94.pdb.

#command#22,23:convert pdb to gro
gmx_mpi editconf -f EM94.pdb -o EM94.gro
gmx_mpi editconf -f 1NFP.pdb -o 1NFP.gro

#Integrate the coodinate information to the receptor and rename as complex.gro.
#Include the topology information to the topol.top.

#command#24,25:construct a dodecahedron box and solvate it.
gmx_mpi editconf -f complex.gro -o newbox.gro -bt dodecahedron -d 1.0

gmx_mpi solvate -cp newbox.gro -cs spc216.gro -p topol.top -o solv.gro

#3
#Add ions to neutralize the system.
#ions.mdp is provided in the supplementary files.
#command#26,27:
gmx_mpi grompp -f ions.mdp -c solv.gro -p topol.top -o ions.tpr
gmx_mpi genion -s ions.tpr -o solv_ions.gro -p topol.top -pname NA -nname CL -neutral

#4
#Energy minimization(EM)
#WHY EM?  -A:to relax the system to ensure that steric clashes or inappropriate geometry.
#command#28,29:
#create the binary file
gmx_mpi grompp -f em.mdp -c solv_ions.gro -p topol.top -o em.tpr
#start EM.
nohup gmx_mpi mdrun -v -deffnm em -nb gpu -ntomp 60 > nohupem.out &

#5
#Restraint the ligand and cofactor
#command#30:create a cofactor index file.
gmx_mpi make_ndx -f ligand/coa/coa_orig.gro -o index_1NFP.ndx
>0 & ! a H*
>q

#command#31:generate the cofactor restraint file
gmx_mpi genrestr -f ligand/coa/coa_orig.gro -n index_1NFP.ndx -o posre_1NFP.itp -fc 1000 1000 1000

#command#32:create a ligand index file.
gmx_mpi make_ndx -f ligand/zinc/EM94.gro  -o index_EM94.ndx
>0 & ! a H*
>q

#command#33:generate the ligand restraint file
gmx_mpi genrestr -f ligand/zinc/EM94.gro -n index_EM94.ndx -o poses_EM94.itp -fc 1000 1000 1000

#Integrate the ligand restraint information to the topol.top

#6
#Thermostats
#command#34,35,36:
#nvt.mdp is provided in supplementary file.

gmx_mpi make_ndx -f em.gro -o index.ndx
>1|13|14
>q

gmx_mpi grompp -f nvt.mdp -c em.gro -r em.gro -p topol.top -n index.ndx -o nvt.tpr

nohup gmx_mpi mdrun -deffnm nvt -v -nb gpu -ntomp 60 >nohupnvt.out &

#7
#NPT
#npt.mdp is provided in supplementary file.
#command#37:
gmx grompp -f npt.mdp -c nvt.gro -t nvt.cpt -r nvt.gro -p topol.top -n index.ndx -o npt.tpr
#command#38:
nohup gmx_mpi mdrun -deffnm npt -v -nb gpu -ntomp 60 > nohupnpt.out &

#8
#START MD.
#md.mdp is provided in supplementary files.
#command#39,40:
gmx_mpi grompp -f md.mdp -c npt.gro -t npt.cpt -p topol.top -n index.ndx -o md_0_50.tpr

nohup gmx_mpi mdrun -deffnm md_0_50 -v -nb gpu -ntomp 60 > nohupmd.out &

####################################################
#                                                  #
#                MD results analysis               #
#                                                  #
####################################################

#Recentering and Rewrapping Coordinates
#In this case,the system is a protein-protein-ligand-cofactor complex,it needs a index file to recentering the system.

#Add chain A to index.ndx manually.

#command#41:
gmx_mpi make_ndx -f em.gro -n index.ndx
>0 & ri 12-17                 # 0 is chainA
>name 26 center
>q


#command#42:
gmx_mpi trjconv -s md_0_50.tpr -f md_0_50.xtc -o md_0_50_center.xtc -n index.ndx -center -pbc mol -ur compact
>26
>1           #1 is system.
>q


#RMSD calculation
#command#43,44

gmx_mpi make_ndx -f em.gro -n index.ndx
>21 & !a H*
>name 27 EM94_heavy
>q

gmx_mpi rms -s em.tpr -f md_0_50_center.xtc -n index.ndx -tu ns -o rmsd.xvg
>5   #backbond
>27   #EM94_heavy
>q


#Number of hydrogen bonds.


#Protein-Ligand Interaction Energy

#ie.mdp is provided in supplementary files.

#command#45
gmx_mpi grompp -f ie.mdp -c npt.gro -t npt.cpt -p topol.top -n index.ndx -o ie.tpr

gmx_mpi mdrun -deffnm ie -rerun md_0_50.xtc -nb cpu

gmx_mpi energy -f ie.edr -o interaction_energy.xvg
>Coul-SR:Protein-EM94
>LJ-SR:Protein-EM94


#The free energy landscape is plotted by matplotlib(free_energy_landscape.ipynb)

#The commands below is to perform pca analysis and generate the free_energy_landscape.txt file for plotting.

#command#46,47
	
gmx_mpi trjconv -f md_0_50.xtc -s  md_0_50.tpr -fit rot+trans -o mdfit.xtc

gmx_mpi trjconv -s md_0_50.tpr -f mdfit.xtc -b 0 -e 0 -o md.gro

#command#48
#covariance matrix
gmx_mpi covar -s md.gro -f mdfit.xtc -o eigenvalues.xvg -v eigenvectors.trr -xpma covapic.xpm

#command#49,50,51
#projection	
gmx_mpi anaeig -f mdfit.xtc -s md.gro -v eigenvectors.trr -last 1 -proj pc1.xvg
gmx_mpi anaeig -f mdfit.xtc -s md.gro -v eigenvectors.trr -frist 2 -last 2 -proj pc2.xvg

perl sham.pl -i1 pc1.xvg -i2 pc2.xvg -data 1 -data2 1 -o gsham_input.xvg

#command#51
#calculation of gibbs free energy
#xpm2txt.py is provided in supplementary files.
gmx_mpi sham -f gsham_input.xvg
python2 xpm2txt.py -f FES.xpm -o free-energy-landscape.txt