Source code for simbad.rotsearch.amore_search

"""Module to run the amore rotation search"""

__author__ = "Adam Simpkin & Felix Simkovic"
__date__ = "15 April 2018"
__version__ = "0.4"

import logging
import os
import shutil
import uuid
logger = logging.getLogger(__name__)

import pyjob.misc

import simbad.db
import simbad.mr
import simbad.rotsearch
import simbad.core.amore_score
import simbad.core.dat_score
import simbad.parsers.refmac_parser
import simbad.parsers.rotsearch_parser
import simbad.util.pdb_util
import simbad.util.mtz_util
import simbad.util.matthews_prob

from phaser import InputMR_DAT, runMR_DAT, InputCCA, runCCA
from simbad.util import EXPORT, CMD_PREFIX


[docs]class AmoreRotationSearch(object): """A class to perform the amore rotation search Attributes ---------- amore_exe : str The path to the amore executable mtz : str The path to the input MTZ work_dir : str The path to the working directory max_to_keep : int The maximum number of results to keep [default: 20] search_results : list The search results Examples -------- >>> from simbad.rotsearch.amore_search import AmoreRotationSearch >>> rotation_search = AmoreRotationSearch('<amore_exe>', '<mtz>', '<mr_program>', '<tmp_dir>', '<work_dir>', ... '<max_to_keep>') >>> rotation_search.run( ... '<models_dir>', '<output_dir>', '<nproc>', '<shres>', '<pklim>', '<npic>', '<rotastep>', ... '<min_solvent_content>', '<submit_qtype>', '<submit_queue>', '<monitor>', '<chunk_size>' ... ) >>> rotation_search.summarize() >>> search_results = rotation_search.search_results If any results are found, an object is returned containing the pdb_code, and the various associated scores from amore. """ def __init__(self, mtz, mr_program, tmp_dir, work_dir, amore_exe=None, max_to_keep=20, skip_mr=False, **kwargs): self.amore_exe = amore_exe self.max_to_keep = max_to_keep self.mr_program = mr_program self.mtz = mtz self.skip_mr = skip_mr self.tmp_dir = tmp_dir self.work_dir = work_dir self.f = None self.sigf = None self.simbad_dat_files = None self.submit_qtype = None self.submit_queue = None self._search_results = None self.tested = []
[docs] def run(self, models_dir, nproc=2, shres=3.0, pklim=0.5, npic=50, rotastep=1.0, min_solvent_content=20, submit_qtype=None, submit_queue=None, monitor=None, chunk_size=0, **kwargs): """Run amore rotation function on a directory of models Parameters ---------- models_dir : str The directory containing the models to run the rotation search on nproc : int, optional The number of processors to run the job on shres : int, float, optional Spherical harmonic resolution [default 3.0] pklim : int, float, optional Peak limit, output all peaks above <float> [default: 0.5] npic : int, optional Number of peaks to output from the translation function map for each orientation [default: 50] rotastep : int, float, optional Size of rotation step [default : 1.0] min_solvent_content : int, float, optional The minimum solvent content present in the unit cell with the input model [default: 30] submit_qtype : str The cluster submission queue type - currently support SGE and LSF submit_queue : str The queue to submit to on the cluster monitor chunk_size : int, optional The number of jobs to submit at the same time Returns ------- file log file for each model in the models_dir """ self.submit_qtype = submit_qtype self.submit_queue = submit_queue self.simbad_dat_files = simbad.db.find_simbad_dat_files(models_dir) n_files = len(self.simbad_dat_files) i = InputMR_DAT() i.setHKLI(self.mtz) i.setMUTE(True) run_mr_data = runMR_DAT(i) sg = run_mr_data.getSpaceGroupName().replace(" ", "") cell = " ".join(map(str, run_mr_data.getUnitCell())) chunk_size = simbad.rotsearch.get_chunk_size(n_files, chunk_size) total_chunk_cycles = simbad.rotsearch.get_total_chunk_cycles(n_files, chunk_size) sol_calc = simbad.util.matthews_prob.SolventContent(cell, sg) dir_name = "simbad-tmp-" + str(uuid.uuid1()) script_log_dir = os.path.join(self.work_dir, dir_name) os.mkdir(script_log_dir) hklpck0 = self._generate_hklpck0() ccp4_scr = os.environ["CCP4_SCR"] default_tmp_dir = os.path.join(self.work_dir, 'tmp') if self.tmp_dir: template_tmp_dir = os.path.join(self.tmp_dir, dir_name + "-{0}") else: template_tmp_dir = os.path.join(default_tmp_dir, dir_name + "-{0}") template_hklpck1 = os.path.join("$CCP4_SCR", "{0}.hkl") template_clmn0 = os.path.join("$CCP4_SCR", "{0}_spmipch.clmn") template_clmn1 = os.path.join("$CCP4_SCR", "{0}.clmn") template_mapout = os.path.join("$CCP4_SCR", "{0}_amore_cross.map") template_table1 = os.path.join("$CCP4_SCR", "{0}_sfs.tab") template_model = os.path.join("$CCP4_SCR", "{0}.pdb") template_rot_log = os.path.join("$CCP4_SCR", "{0}_rot.log") predicted_molecular_weight = 0 if run_mr_data.Success(): i = InputCCA() i.setSPAC_HALL(run_mr_data.getSpaceGroupHall()) i.setCELL6(run_mr_data.getUnitCell()) i.setMUTE(True) run_cca = runCCA(i) if run_cca.Success(): predicted_molecular_weight = run_cca.getAssemblyMW() dat_models = [] for dat_model in self.simbad_dat_files: name = os.path.basename(dat_model.replace(".dat", "")) pdb_struct = simbad.util.pdb_util.PdbStructure() pdb_struct.from_file(dat_model) solvent_content = sol_calc.calculate_from_struct(pdb_struct) if solvent_content < min_solvent_content: msg = "Skipping %s: solvent content is predicted to be less than %.2f" logger.debug(msg, name, min_solvent_content) continue x, y, z, intrad = pdb_struct.integration_box model_molecular_weight = pdb_struct.molecular_weight mw_diff = abs(predicted_molecular_weight - model_molecular_weight) info = simbad.core.dat_score.DatModelScore( name, dat_model, mw_diff, x, y, z, intrad, solvent_content, None ) dat_models.append(info) sorted_dat_models = sorted(dat_models, key=lambda x: float(x.mw_diff), reverse=False) iteration_range = range(0, n_files, chunk_size) for cycle, i in enumerate(iteration_range): logger.info("Working on chunk %d out of %d", cycle + 1, total_chunk_cycles) amore_files = [] for dat_model in sorted_dat_models[i:i + chunk_size]: logger.debug("Generating script to perform AMORE rotation " + "function on %s", dat_model.pdb_code) pdb_model = template_model.format(dat_model.pdb_code) table1 = template_table1.format(dat_model.pdb_code) hklpck1 = template_hklpck1.format(dat_model.pdb_code) clmn0 = template_clmn0.format(dat_model.pdb_code) clmn1 = template_clmn1.format(dat_model.pdb_code) mapout = template_mapout.format(dat_model.pdb_code) conv_py = "\"from simbad.db import convert_dat_to_pdb; convert_dat_to_pdb('{}', '{}')\"" conv_py = conv_py.format(dat_model.dat_path, pdb_model) tab_cmd = [self.amore_exe, "xyzin1", pdb_model, "xyzout1", pdb_model, "table1", table1] tab_stdin = self.tabfun_stdin_template.format( x=dat_model.x, y=dat_model.y, z=dat_model.z, a=90, b=90, c=120 ) rot_cmd = [self.amore_exe, 'table1', table1, 'HKLPCK1', hklpck1, 'hklpck0', hklpck0, 'clmn1', clmn1, 'clmn0', clmn0, 'MAPOUT', mapout] rot_stdin = self.rotfun_stdin_template.format( shres=shres, intrad=dat_model.intrad, pklim=pklim, npic=npic, step=rotastep ) rot_log = template_rot_log.format(dat_model.pdb_code) tmp_dir = template_tmp_dir.format(dat_model.pdb_code) cmd = [ [EXPORT, "CCP4_SCR=" + tmp_dir], ["mkdir", "-p", "$CCP4_SCR\n"], [CMD_PREFIX, "$CCP4/bin/ccp4-python", "-c", conv_py, os.linesep], tab_cmd + ["<< eof >", os.devnull], [tab_stdin], ["eof"], [os.linesep], rot_cmd + ["<< eof >", rot_log], [rot_stdin], ["eof"], [os.linesep], ["grep", "-m 1", "SOLUTIONRCD", rot_log, os.linesep], ["rm", "-rf", "$CCP4_SCR\n"], [EXPORT, "CCP4_SCR=" + ccp4_scr], ] amore_script = pyjob.misc.make_script( cmd, directory=script_log_dir, prefix="amore_", stem=dat_model.pdb_code ) amore_log = amore_script.rsplit(".", 1)[0] + '.log' amore_files += [(amore_script, tab_stdin, rot_stdin, amore_log, dat_model.dat_path)] results = [] if len(amore_files) > 0: logger.info("Running AMORE tab/rot functions") amore_scripts, _, _, amore_logs, dat_models = zip(*amore_files) simbad.rotsearch.submit_chunk(amore_scripts, script_log_dir, nproc, 'simbad_amore', submit_qtype, submit_queue, monitor, self.rot_succeeded_log) for dat_model, amore_log in zip(dat_models, amore_logs): base = os.path.basename(amore_log) pdb_code = base.replace("amore_", "").replace(".log", "") try: rotsearch_parser = simbad.parsers.rotsearch_parser.AmoreRotsearchParser( amore_log ) score = simbad.core.amore_score.AmoreRotationScore( pdb_code, dat_model, rotsearch_parser.alpha, rotsearch_parser.beta, rotsearch_parser.gamma, rotsearch_parser.cc_f, rotsearch_parser.rf_f, rotsearch_parser.cc_i, rotsearch_parser.cc_p, rotsearch_parser.icp, rotsearch_parser.cc_f_z_score, rotsearch_parser.cc_p_z_score, rotsearch_parser.num_of_rot ) if rotsearch_parser.cc_f_z_score: results += [score] except IOError: pass else: logger.critical("No structures to be trialled") self._search_results = results shutil.rmtree(script_log_dir) if os.path.isdir(default_tmp_dir): shutil.rmtree(default_tmp_dir)
[docs] def summarize(self, csv_file): """Summarize the search results Parameters ---------- csv_file : str The path for a backup CSV file Raises ------ No results found """ from simbad.util import summarize_result columns = [ "ALPHA", "BETA", "GAMMA", "CC_F", "RF_F", "CC_I", "CC_P", "Icp", "CC_F_Z_score", "CC_P_Z_score", "Number_of_rotation_searches_producing_peak" ] summarize_result(self.search_results, csv_file=csv_file, columns=columns)
def _generate_hklpck0(self): f, sigf, _, _, _, _, _ = simbad.util.mtz_util.get_labels(self.mtz) logger.info("Preparing files for AMORE rotation function") stdin = self.sortfun_stdin_template.format(f=f, sigf=sigf) hklpck0 = os.path.join(self.work_dir, 'spmipch.hkl') cmd = [self.amore_exe, 'hklin', self.mtz, 'hklpck0', hklpck0] pyjob.cexec(cmd, stdin=stdin) return hklpck0 def _write_stdin(self, directory, prefix, name, content): f = pyjob.misc.tmp_file(directory=directory, prefix=prefix, stem=name, suffix=".stdin") with open(f, "w") as f_out: f_out.write(content) return f @property def search_results(self): return sorted(self._search_results, key=lambda x: float(x.CC_F_Z_score), reverse=True)[:self.max_to_keep] @property def sortfun_stdin_template(self): return """TITLE ** spmi packing h k l F for crystal** SORTFUN RESOL 100. 2.5 LABI FP={f} SIGFP={sigf}""" @property def tabfun_stdin_template(self): return """TITLE: Produce table for MODEL FRAGMENT TABFUN CRYSTAL {x} {y} {z} {a} {b} {c} ORTH 1 MODEL 1 BTARGET 23.5 SAMPLE 1 RESO 2.5 SHANN 2.5 SCALE 4.0""" @property def rotfun_stdin_template(self): return """TITLE: Generate HKLPCK1 from MODEL FRAGMENT 1 ROTFUN GENE 1 RESO 100.0 {shres} CELL_MODEL 80 75 65 CLMN CRYSTAL ORTH 1 RESO 20.0 {shres} SPHERE {intrad} CLMN MODEL 1 RESO 20.0 {shres} SPHERE {intrad} ROTA CROSS MODEL 1 PKLIM {pklim} NPIC {npic} STEP {step}""" @staticmethod def _rot_job_succeeded(amore_z_score): """Check values for job success""" return amore_z_score > 10
[docs] def rot_succeeded_log(self, log): """Check a rotation search job for it's success Parameters ---------- log : str The path to a log file Returns ------- bool Success status of the rot run """ if self.skip_mr: return False rot_prog, pdb = os.path.basename(log).replace('.log', '').split('_', 1) rotsearch_parser = simbad.parsers.rotsearch_parser.AmoreRotsearchParser(log) dat_model = [s for s in self.simbad_dat_files if pdb in s][0] score = simbad.core.amore_score.AmoreRotationScore( pdb, dat_model, rotsearch_parser.alpha, rotsearch_parser.beta, rotsearch_parser.gamma, rotsearch_parser.cc_f, rotsearch_parser.rf_f, rotsearch_parser.cc_i, rotsearch_parser.cc_p, rotsearch_parser.icp, rotsearch_parser.cc_f_z_score, rotsearch_parser.cc_p_z_score, rotsearch_parser.num_of_rot ) results = [score] if self._rot_job_succeeded(rotsearch_parser.cc_f_z_score) and pdb not in self.tested: self.tested.append(pdb) output_dir = os.path.join(self.work_dir, "mr_search") mr = simbad.mr.MrSubmit(mtz=self.mtz, mr_program=self.mr_program, refine_program='refmac5', refine_type=None, refine_cycles=0, output_dir=output_dir, tmp_dir=self.tmp_dir, timeout=30) mr.mute = True mr.submit_jobs(results, nproc=1, process_all=True, submit_qtype=self.submit_qtype, submit_queue=self.submit_queue) refmac_log = os.path.join(output_dir, pdb, "mr", self.mr_program, "refine", pdb + "_ref.log") if os.path.isfile(refmac_log): refmac_parser = simbad.parsers.refmac_parser.RefmacParser(refmac_log) return simbad.rotsearch.mr_job_succeeded(refmac_parser.final_r_fact, refmac_parser.final_r_free) return False