Source code for simbad.rotsearch.phaser_search

"""Module to run the phaser rotation search"""

__author__ = "Adam Simpkin & Felix Simkovic"
__date__ = "15 April 2018"
__version__ = "0.4"

import logging
import os
import shutil
import uuid
logger = logging.getLogger(__name__)

import pyjob.misc

import simbad.db
import simbad.mr
import simbad.rotsearch
import simbad.core.dat_score
import simbad.core.phaser_score
import simbad.parsers.refmac_parser
import simbad.parsers.rotsearch_parser
import simbad.util.pdb_util
import simbad.util.mtz_util
import simbad.util.matthews_prob

from phaser import InputMR_DAT, runMR_DAT, InputCCA, runCCA
from simbad.util import EXPORT, CMD_PREFIX


[docs]class PhaserRotationSearch(object): """A class to perform the phaser rotation search Attributes ---------- mtz : str The path to the input MTZ i : str Column label for I sigi : str Column label for SIGI work_dir : str The path to the working directory max_to_keep : int The maximum number of results to keep [default: 20] Examples -------- >>> from simbad.rotsearch.phaser_search import PhaserRotationSearch >>> rotation_search = PhaserRotationSearch('<mtz>', '<mr_program>', '<tmp_dir>', '<work_dir>', '<max_to_keep>', ... '<skip_mr>') >>> rotation_search.run( ... '<models_dir>', '<nproc>', '<min_solvent_content>', '<submit_qtype>', ... '<submit_queue>', '<monitor>', '<chunk_size>' ... ) >>> rotation_search.summarize() >>> search_results = rotation_search.search_results If any results are found, an object is returned containing the pdb_code, and the various associated scores from phaser. """ def __init__(self, mtz, mr_program, tmp_dir, work_dir, max_to_keep=20, skip_mr=False, **kwargs): self.max_to_keep = max_to_keep self.mr_program = mr_program self.mtz = mtz self.tmp_dir = tmp_dir self.work_dir = work_dir self.skip_mr = skip_mr self.f = None self.sigf = None self.i = None self.sigi = None self.simbad_dat_files = None self.submit_qtype = None self.submit_queue = None self._search_results = None self.tested = []
[docs] def run(self, models_dir, nproc=2, min_solvent_content=20, submit_qtype=None, submit_queue=None, monitor=None, chunk_size=0, **kwargs): """Run phaser rotation function on a directory of models Parameters ---------- models_dir : str The directory containing the models to run the rotation search on nproc : int, optional The number of processors to run the job on min_solvent_content : int, float, optional The minimum solvent content present in the unit cell with the input model [default: 30] submit_qtype : str The cluster submission queue type - currently support SGE and LSF submit_queue : str The queue to submit to on the cluster monitor chunk_size : int, optional The number of jobs to submit at the same time Returns ------- file log file for each model in the models_dir """ self.submit_qtype = submit_qtype self.submit_queue = submit_queue self.f, self.sigf, self.i, self.sigi, _, _, _ = simbad.util.mtz_util.get_labels(self.mtz) self.simbad_dat_files = simbad.db.find_simbad_dat_files(models_dir) n_files = len(self.simbad_dat_files) i = InputMR_DAT() i.setHKLI(self.mtz) i.setMUTE(True) run_mr_data = runMR_DAT(i) sg = run_mr_data.getSpaceGroupName().replace(" ", "") cell = " ".join(map(str, run_mr_data.getUnitCell())) chunk_size = simbad.rotsearch.get_chunk_size(n_files, chunk_size) total_chunk_cycles = simbad.rotsearch.get_total_chunk_cycles(n_files, chunk_size) mat_coef = simbad.util.matthews_prob.MatthewsProbability(cell, sg) dir_name = "simbad-tmp-" + str(uuid.uuid1()) script_log_dir = os.path.join(self.work_dir, dir_name) os.mkdir(script_log_dir) ccp4_scr = os.environ["CCP4_SCR"] default_tmp_dir = os.path.join(self.work_dir, 'tmp') if self.tmp_dir: template_tmp_dir = os.path.join(self.tmp_dir, dir_name + "-{0}") else: template_tmp_dir = os.path.join(default_tmp_dir, dir_name + "-{0}") predicted_molecular_weight = 0 if run_mr_data.Success(): i = InputCCA() i.setSPAC_HALL(run_mr_data.getSpaceGroupHall()) i.setCELL6(run_mr_data.getUnitCell()) i.setMUTE(True) run_cca = runCCA(i) if run_cca.Success(): predicted_molecular_weight = run_cca.getAssemblyMW() dat_models = [] for dat_model in self.simbad_dat_files: name = os.path.basename(dat_model.replace(".dat", "")) pdb_struct = simbad.util.pdb_util.PdbStructure() pdb_struct.from_file(dat_model) solvent_fraction, n_copies = mat_coef.calculate_content_ncopies_from_struct(pdb_struct) solvent_content = solvent_fraction * 100 if solvent_content < min_solvent_content: msg = "Skipping %s: solvent content is predicted to be less than %.2f" logger.debug(msg, name, min_solvent_content) continue model_molecular_weight = pdb_struct.molecular_weight mw_diff = abs(predicted_molecular_weight - model_molecular_weight) info = simbad.core.dat_score.DatModelScore( name, dat_model, mw_diff, None, None, None, None, solvent_fraction, n_copies ) dat_models.append(info) sorted_dat_models = sorted(dat_models, key=lambda x: float(x.mw_diff), reverse=False) iteration_range = range(0, n_files, chunk_size) for cycle, i in enumerate(iteration_range): logger.info("Working on chunk %d out of %d", cycle + 1, total_chunk_cycles) template_model = os.path.join("$CCP4_SCR", "{0}.pdb") phaser_files = [] for dat_model in sorted_dat_models[i:i + chunk_size]: logger.debug("Generating script to perform PHASER rotation " + "function on %s", dat_model.pdb_code) pdb_model = template_model.format(dat_model.pdb_code) template_rot_log = os.path.join("$CCP4_SCR", "{0}_rot.log") conv_py = "\"from simbad.db import convert_dat_to_pdb; convert_dat_to_pdb('{}', '{}')\"" conv_py = conv_py.format(dat_model.dat_path, pdb_model) rot_log = template_rot_log.format(dat_model.pdb_code) tmp_dir = template_tmp_dir.format(dat_model.pdb_code) phaser_cmd = ["simbad.rotsearch.phaser_rotation_search", "-hklin", self.mtz, "-f", self.f, "-sigf", self.sigf, "-i", self.i, "-sigi", self.sigi, "-pdbin", pdb_model, "-logfile", rot_log, "-solvent", dat_model.solvent, "-nmol", dat_model.nmol, "-work_dir", tmp_dir ] phaser_cmd = " ".join(str(e) for e in phaser_cmd) cmd = [ [EXPORT, "CCP4_SCR=" + tmp_dir], ["mkdir", "-p", "$CCP4_SCR\n"], [CMD_PREFIX, "$CCP4/bin/ccp4-python", "-c", conv_py, os.linesep], [CMD_PREFIX, "$CCP4/bin/ccp4-python", "-m", phaser_cmd, os.linesep], ["rm", "-rf", "$CCP4_SCR\n"], [EXPORT, "CCP4_SCR=" + ccp4_scr], ] phaser_script = pyjob.misc.make_script( cmd, directory=script_log_dir, prefix="phaser_", stem=dat_model.pdb_code ) phaser_log = phaser_script.rsplit(".", 1)[0] + '.log' phaser_files += [(phaser_script, phaser_log, dat_model.dat_path)] results = [] if len(phaser_files) > 0: logger.info("Running PHASER rotation functions") phaser_scripts, phaser_logs, dat_models = zip(*phaser_files) simbad.rotsearch.submit_chunk(phaser_scripts, script_log_dir, nproc, 'simbad_phaser', submit_qtype, submit_queue, monitor, self.rot_succeeded_log) for dat_model, phaser_log in zip(dat_models, phaser_logs): base = os.path.basename(phaser_log) pdb_code = base.replace("phaser_", "").replace(".log", "") phaser_rotation_parser = simbad.parsers.rotsearch_parser.PhaserRotsearchParser( phaser_log ) score = simbad.core.phaser_score.PhaserRotationScore(pdb_code, dat_model, phaser_rotation_parser.llg, phaser_rotation_parser.rfz) if phaser_rotation_parser.rfz: results += [score] else: logger.critical("No structures to be trialled") self._search_results = results shutil.rmtree(script_log_dir) if os.path.isdir(default_tmp_dir): shutil.rmtree(default_tmp_dir)
[docs] def summarize(self, csv_file): """Summarize the search results Parameters ---------- csv_file : str The path for a backup CSV file Raises ------ No results found """ from simbad.util import summarize_result columns = [ "llg", "rfz" ] summarize_result(self.search_results, csv_file=csv_file, columns=columns)
@property def search_results(self): return sorted(self._search_results, key=lambda x: float(x.llg), reverse=True)[:self.max_to_keep] @staticmethod def _rot_job_succeeded(phaser_llg_score): """Check values for job success""" return phaser_llg_score > 100
[docs] def rot_succeeded_log(self, log): """Check a rotation search job for it's success Parameters ---------- log : str The path to a log file Returns ------- bool Success status of the rot run """ if self.skip_mr: return False rot_prog, pdb = os.path.basename(log).replace('.log', '').split('_', 1) rotsearch_parser = simbad.parsers.rotsearch_parser.PhaserRotsearchParser(log) dat_model = [s for s in self.simbad_dat_files if pdb in s][0] score = simbad.core.phaser_score.PhaserRotationScore( pdb, dat_model, rotsearch_parser.llg, rotsearch_parser.rfz ) results = [score] if self._rot_job_succeeded(rotsearch_parser.llg) and pdb not in self.tested: self.tested.append(pdb) output_dir = os.path.join(self.work_dir, "mr_search") mr = simbad.mr.MrSubmit(mtz=self.mtz, mr_program=self.mr_program, refine_program='refmac5', refine_type=None, refine_cycles=0, output_dir=output_dir, tmp_dir=self.tmp_dir, timeout=30) mr.mute = True mr.submit_jobs(results, nproc=1, process_all=True, submit_qtype=self.submit_qtype, submit_queue=self.submit_queue) refmac_log = os.path.join(output_dir, pdb, "mr", self.mr_program, "refine", pdb + "_ref.log") if os.path.isfile(refmac_log): refmac_parser = simbad.parsers.refmac_parser.RefmacParser(refmac_log) return simbad.rotsearch.mr_job_succeeded(refmac_parser.final_r_fact, refmac_parser.final_r_free) return False