Source code for simbad.mr

"""Class to run MR on SIMBAD results"""

from __future__ import division

__author__ = "Adam Simpkin"
__date__ = "09 Mar 2017"
__version__ = "1.0"

import logging
import numpy
import os

from pyjob import Job, cexec
from pyjob.misc import make_script, tmp_file

from simbad.mr import anomalous_util
from simbad.parsers import molrep_parser
from simbad.parsers import phaser_parser
from simbad.parsers import refmac_parser
from simbad.util.pdb_util import PdbStructure
from simbad.util.matthews_prob import MatthewsProbability, SolventContent
from simbad.util import mtz_util

from simbad.core.lattice_score import LatticeSearchResult
from simbad.core.amore_score import AmoreRotationScore
from simbad.core.phaser_score import PhaserRotationScore
from simbad.core.mr_score import MrScore

logger = logging.getLogger(__name__)

EXPORT = "SET" if os.name == "nt" else "export"
CMD_PREFIX = "call" if os.name == "nt" else ""

# Make clear which binaries we currently support
KNOWN_MR_PROGRAMS = ["molrep", "phaser"]
KNOWN_REF_PROGRAMS = ["refmac5"]


[docs]class MrSubmit(object): """Class to run MR on a defined set of models Attributes ---------- mtz : str Path to the input MTZ file mr_program : str Name of the molecular replacement program to use refine_program : str Name of the refinement program to use refine_type : str Type of refinement to run (None | jelly) refine_cycles : int The number of refinement cycles (default: 30) output_dir : str Path to the directory to output results sgalternative : str Specify whether to try alternative space groups (all | enant) results : obj Results from :obj: '_LatticeParameterScore' or :obj: '_AmoreRotationScore' Examples -------- >>> from simbad.mr import MrSubmit >>> MR = MrSubmit('<mtz>', '<mr_program>', '<refine_program>', '<refine_type>', '<output_dir>', '<sgalternative>') >>> MR.submit_jobs('<results>', '<nproc>', '<submit_cluster>', '<submit_qtype>', '<submit_queue>', ... '<submit_array>', '<submit_max_array>', '<process_all>', '<monitor>') If a solution is found and process_all is not set, the queued jobs will be terminated. """ def __init__(self, mtz, mr_program, refine_program, refine_type, refine_cycles, output_dir, tmp_dir, timeout, sgalternative=None): """Initialise MrSubmit class""" self.input_file = None self._process_all = None self._sgalternative = None self._mtz = None self._mr_program = None self._output_dir = None self._refine_program = None self._refine_type = None self._refine_cycles = None self._search_results = [] self._timeout = None # options derived from the input mtz self._cell_parameters = None self._resolution = None self._space_group = None self._f = None self._sigf = None self._i = None self._sigi = None self._dano = None self._sigdano = None self._free = None self.sgalternative = sgalternative self.mtz = mtz self.mr_program = mr_program self.mute = False self.output_dir = output_dir self.refine_program = refine_program self.refine_type = refine_type self.refine_cycles = refine_cycles self.tmp_dir = tmp_dir self.timeout = timeout @property def mtz(self): """The input MTZ file""" return self._mtz @mtz.setter def mtz(self, mtz): """Define the input MTZ file""" self._mtz = os.path.abspath(mtz) self.get_mtz_info(mtz) @property def cell_parameters(self): """The cell parameters of the input MTZ file""" return self._cell_parameters @property def resolution(self): """The resolution of the input MTZ file""" return self._resolution @property def search_results(self): """The results from the amore rotation search""" return sorted(self._search_results, key=lambda x: float(x.final_r_free), reverse=False) @property def sgalternative(self): """Whether to check for alternative space groups""" return self._sgalternative @sgalternative.setter def sgalternative(self, sgalternative): """Define whether to check for alternative space groups""" if sgalternative: self._sgalternative = sgalternative.lower() else: self._sgalternative = sgalternative @property def space_group(self): """The space group of the input MTZ file""" return self._space_group @property def f(self): """The F column label of the input MTZ file""" return self._f @property def sigf(self): """The SIGF column label of the input MTZ file""" return self._sigf @property def i(self): """The I column label of the input MTZ file""" return self._i @property def sigi(self): """The SIGI column label of the input MTZ file""" return self._sigi @property def free(self): """The FREE column label of the input MTZ file""" return self._free @property def mr_python_module(self): """The MR python module""" if self.mr_program == "molrep": return "simbad.mr.molrep_mr" elif self.mr_program == "phaser": return "simbad.mr.phaser_mr" @property def mr_program(self): """The molecular replacement program to use""" return self._mr_program @mr_program.setter def mr_program(self, mr_program): """Define the molecular replacement program to use""" if mr_program.lower() in KNOWN_MR_PROGRAMS: self._mr_program = mr_program.lower() else: msg = "Unknown MR program!" raise RuntimeError(msg) @property def refine_python_module(self): """The Refinement python module""" if self.refine_program == "refmac5": return "simbad.mr.refmac_refine" @property def refine_program(self): """The refinement program to use""" return self._refine_program @refine_program.setter def refine_program(self, refine_program): """Define the refinement program to use""" if refine_program.lower() in KNOWN_REF_PROGRAMS: self._refine_program = refine_program else: msg = "Unknown Refinement program!" raise RuntimeError(msg) @property def refine_type(self): """The refinement type to use""" return self._refine_type @refine_type.setter def refine_type(self, refine_type): """Define the refinement type to use""" self._refine_type = refine_type @property def refine_cycles(self): """The number of refinement cycles to use""" return self._refine_cycles @refine_cycles.setter def refine_cycles(self, refine_cycles): """Define the number of refinement cycles to use""" self._refine_cycles = refine_cycles @property def output_dir(self): """The path to the output directory""" return self._output_dir @output_dir.setter def output_dir(self, output_dir): """Define the output directory""" self._output_dir = output_dir @property def timeout(self): """The time in minutes before phaser is killed""" return self._timeout @timeout.setter def timeout(self, timeout): """Define the time in minutes before phaser should be killed""" self._timeout = timeout
[docs] def get_mtz_info(self, mtz): """Get various information from the input MTZ Parameters ---------- mtz : str Path to the input MTZ Returns ------- self._cell_parameters : list The parameters that descibe the unit cell self._resolution : float The resolution of the data self._space_group : str The space group of the data self._f : str The column label for F self._sigf : str The column label for SIGF self._free : str The column label for FREE self._solvent : float The predicted solvent content of the protein """ # Extract crystal data from input mtz self._space_group, _, cell_parameters = mtz_util.crystal_data(mtz) self._cell_parameters = " ".join(map(str, cell_parameters)) # Extract column labels from input mtz self._f, self._sigf, self._i, self._sigi, self._dano, self._sigdano, self._free = mtz_util.get_labels( mtz)
[docs] def submit_jobs(self, results, nproc=1, process_all=False, submit_qtype=None, submit_queue=False, monitor=None): """Submit jobs to run in serial or on a cluster Parameters ---------- results : class Results from :obj: '_LatticeParameterScore' or :obj: '_AmoreRotationScore' nproc : int, optional Number of processors to use [default: 1] process_all : bool, optional Terminate MR after a success [default: True] submit_qtype : str The cluster submission queue type - currently support SGE and LSF submit_queue : str The queue to submit to on the cluster monitor : str Returns ------- file Output pdb from mr file Output hkl from mr - if using phaser file Output log file from mr program file Output pdb from refinement file Output hkl from refinement file Output log file from refinement program """ if not os.path.isdir(self.output_dir): os.mkdir(self.output_dir) run_files = [] sol_cont = SolventContent(self.cell_parameters, self.space_group) mat_prob = MatthewsProbability(self.cell_parameters, self.space_group) for result in results: mr_workdir = os.path.join(self.output_dir, result.pdb_code, 'mr', self.mr_program) mr_logfile = os.path.join(mr_workdir, '{0}_mr.log'.format(result.pdb_code)) mr_pdbout = os.path.join(mr_workdir, '{0}_mr_output.pdb'.format(result.pdb_code)) ref_workdir = os.path.join(mr_workdir, 'refine') ref_hklout = os.path.join(ref_workdir, '{0}_refinement_output.mtz'.format(result.pdb_code)) ref_logfile = os.path.join(ref_workdir, '{0}_ref.log'.format(result.pdb_code)) ref_pdbout = os.path.join(ref_workdir, '{0}_refinement_output.pdb'.format(result.pdb_code)) diff_mapout1 = os.path.join(ref_workdir, '{0}_refmac_2fofcwt.map'.format(result.pdb_code)) diff_mapout2 = os.path.join(ref_workdir, '{0}_refmac_fofcwt.map'.format(result.pdb_code)) if os.path.isfile(ref_logfile): rp = refmac_parser.RefmacParser(ref_logfile) if _mr_job_succeeded(rp.final_r_fact, rp.final_r_free): score = MrScore(pdb_code=result.pdb_code) if self.mr_program == "molrep": mp = molrep_parser.MolrepParser(mr_logfile) score.molrep_score = mp.score score.molrep_tfscore = mp.tfscore elif self.mr_program == "phaser": pp = phaser_parser.PhaserParser(mr_logfile) score.phaser_tfz = pp.tfz score.phaser_llg = pp.llg score.phaser_rfz = pp.rfz rp = refmac_parser.RefmacParser(ref_logfile) score.final_r_free = rp.final_r_free score.final_r_fact = rp.final_r_fact self._search_results = [score] return if isinstance(result, AmoreRotationScore) or isinstance(result, PhaserRotationScore): pdb_struct = PdbStructure() pdb_struct.from_file(result.dat_path) mr_pdbin = os.path.join(self.output_dir, result.pdb_code + ".pdb") pdb_struct.save(mr_pdbin) elif isinstance(result, LatticeSearchResult): pdb_struct = PdbStructure() pdb_struct.from_file(result.pdb_path) mr_pdbin = result.pdb_path else: raise ValueError("Do not recognize result container") solvent_content = sol_cont.calculate_from_struct(pdb_struct) if solvent_content > 30: n_copies = 1 else: pdb_struct.keep_first_chain_only() pdb_struct.save(mr_pdbin) solvent_content, n_copies = mat_prob.calculate_content_ncopies_from_struct(pdb_struct) msg = "%s is predicted to be too large to fit in the unit "\ + "cell with a solvent content of at least 30 percent, "\ + "therefore MR will use only the first chain" logger.debug(msg, result.pdb_code) mr_cmd = [ CMD_PREFIX, "ccp4-python", "-m", self.mr_python_module, "-hklin", self.mtz, "-pdbin", mr_pdbin, "-pdbout", mr_pdbout, "-logfile", mr_logfile, "-work_dir", mr_workdir, "-nmol", n_copies, "-sgalternative", self.sgalternative ] ref_cmd = [ CMD_PREFIX, "ccp4-python", "-m", self.refine_python_module, "-pdbin", mr_pdbout, "-pdbout", ref_pdbout, "-hklin", self.mtz, "-hklout", ref_hklout, "-logfile", ref_logfile, "-work_dir", ref_workdir, "-refinement_type", self.refine_type, "-ncyc", self.refine_cycles ] if self.mr_program == "molrep": mr_cmd += ["-space_group", self.space_group] elif self.mr_program == "phaser": mr_cmd += [ "-i", self.i, "-sigi", self.sigi, "-f", self.f, "-sigf", self.sigf, "-solvent", solvent_content, "-timeout", self.timeout, ] if isinstance(result, LatticeSearchResult): mr_cmd += [ '-autohigh', 4.0, '-hires', 5.0 ] # ==== # Create a run script - prefix __needs__ to contain mr_program so we can find log # Leave order of this as SGE does not like scripts with numbers as first char # ==== prefix, stem = self.mr_program + "_", result.pdb_code fft_cmd1, fft_stdin1 = self.fft(ref_hklout, diff_mapout1, "2mfo-dfc") run_stdin_1 = tmp_file(directory=self.output_dir, prefix=prefix, stem=stem, suffix="_1.stdin") with open(run_stdin_1, 'w') as f_out: f_out.write(fft_stdin1) fft_cmd2, fft_stdin2 = self.fft(ref_hklout, diff_mapout2, "mfo-dfc") run_stdin_2 = tmp_file(directory=self.output_dir, prefix=prefix, stem=stem, suffix="_2.stdin") with open(run_stdin_2, 'w') as f_out: f_out.write(fft_stdin2) ccp4_scr = os.environ["CCP4_SCR"] if self.tmp_dir: tmp_dir = os.path.join(self.tmp_dir) else: tmp_dir = os.path.join(self.output_dir) cmd = [ [EXPORT, "CCP4_SCR=" + tmp_dir], mr_cmd + [os.linesep], ref_cmd + [os.linesep], fft_cmd1 + ["<", run_stdin_1, os.linesep], fft_cmd2 + ["<", run_stdin_2, os.linesep], [EXPORT, "CCP4_SCR=" + ccp4_scr], ] run_script = make_script(cmd, directory=self.output_dir, prefix=prefix, stem=stem) run_log = run_script.rsplit(".", 1)[0] + '.log' run_files += [(run_script, run_stdin_1, run_stdin_2, run_log, mr_pdbout, mr_logfile, ref_logfile)] if not self.mute: logger.info("Running %s Molecular Replacement", self.mr_program) run_scripts, _, _, _, mr_pdbouts, mr_logfiles, ref_logfiles = zip( *run_files) j = Job(submit_qtype) j.submit(run_scripts, directory=self.output_dir, nproc=nproc, name='simbad_mr', queue=submit_queue, permit_nonzero=True) interval = int(numpy.log(len(run_scripts)) / 3) interval_in_seconds = interval if interval >= 5 else 5 if process_all: j.wait(interval=interval_in_seconds, monitor=monitor) else: j.wait(interval=interval_in_seconds, monitor=monitor, check_success=mr_succeeded_log) mr_results = [] for result, mr_logfile, mr_pdbout, ref_logfile in zip(results, mr_logfiles, mr_pdbouts, ref_logfiles): if not os.path.isfile(mr_logfile): logger.debug("Cannot find %s MR log file: %s", self.mr_program, mr_logfile) continue elif not os.path.isfile(ref_logfile): logger.debug("Cannot find %s refine log file: %s", self.mr_program, ref_logfile) continue elif not os.path.isfile(mr_pdbout): logger.debug("Cannot find %s output file: %s", self.mr_program, mr_pdbout) continue score = MrScore(pdb_code=result.pdb_code) if self.mr_program == "molrep": mp = molrep_parser.MolrepParser(mr_logfile) score.molrep_score = mp.score score.molrep_tfscore = mp.tfscore elif self.mr_program == "phaser": pp = phaser_parser.PhaserParser(mr_logfile) score.phaser_tfz = pp.tfz score.phaser_llg = pp.llg score.phaser_rfz = pp.rfz if self._dano is not None: try: anode = anomalous_util.AnodeSearch( self.mtz, self.output_dir, self.mr_program) anode.run(result) a = anode.search_results() score.dano_peak_height = a.dano_peak_height score.nearest_atom = a.nearest_atom except RuntimeError: logger.debug( "RuntimeError: Unable to create DANO map for: %s", result.pdb_code) except IOError: logger.debug( "IOError: Unable to create DANO map for: %s", result.pdb_code) if os.path.isfile(ref_logfile): rp = refmac_parser.RefmacParser(ref_logfile) score.final_r_free = rp.final_r_free score.final_r_fact = rp.final_r_fact else: logger.debug("Cannot find %s log file: %s", self.refine_program, ref_logfile) mr_results += [score] self._search_results = mr_results
[docs] @staticmethod def fft(hklin, mapout, map_type): """Function to run fft to generate difference maps for uglymol Parameters ---------- hklin : str Path to input HKL file mapout : str Path to output MAP file map_type : str Define type of run, either mfo-dfc or 2mfo-dfc Returns ------- list cmd str stdin Raises ------ ValueError Unknown map type """ cmd = [CMD_PREFIX, "fft", "hklin", hklin, "mapout", mapout] if map_type == "2mfo-dfc": stdin = "title Sigmaa style 2mfo-dfc map calculated with refmac coefficients" + os.linesep \ + "labi F1=FWT PHI=PHWT" + os.linesep \ + "end" + os.linesep elif map_type == "mfo-dfc": stdin = "title Sigmaa style mfo-dfc map calculated with refmac coefficients" + os.linesep \ + "labi F1=DELFWT PHI=PHDELWT" + os.linesep \ + "end" + os.linesep else: msg = "Unknown map type!" raise ValueError(msg) return cmd, stdin
[docs] def summarize(self, csv_file): """Summarize the search results Parameters ---------- csv_file : str The path for a backup CSV file Raises ------ No results found """ from simbad.util import summarize_result columns = [] if self.mr_program == "molrep": columns += ["molrep_score", "molrep_tfscore"] elif self.mr_program == "phaser": columns += ["phaser_tfz", "phaser_llg", "phaser_rfz"] columns += ["final_r_fact", "final_r_free"] if self._dano: columns += ["dano_peak_height", "nearest_atom"] summarize_result(self.search_results, csv_file=csv_file, columns=columns)
def _mr_job_succeeded(r_fact, r_free): """Check values for job success""" return r_fact < 0.45 and r_free < 0.45
[docs]def mr_succeeded_log(log): """Check a Molecular Replacement job for it's success Parameters ---------- log : str The path to a log file Returns ------- bool Success status of the MR run """ mr_prog, pdb = os.path.basename(log).replace('.log', '').split('_', 1) refmac_log = os.path.join(os.path.dirname( log), pdb, "mr", mr_prog, "refine", pdb + "_ref.log") if os.path.isfile(refmac_log): rp = refmac_parser.RefmacParser(refmac_log) return _mr_job_succeeded(rp.final_r_fact, rp.final_r_free) return False
[docs]def mr_succeeded_csvfile(f): """Check a Molecular Replacement job for it's success Parameters ---------- f : str The path to f Returns ------- bool Success status of the MR run """ import pandas as pd df = pd.read_csv(f) data = zip(df.final_r_fact.tolist(), df.final_r_free.tolist()) return any(_mr_job_succeeded(rfact, rfree) for rfact, rfree in data)