Source code for simbad.rotsearch.amore_search

"""Module to run the AMORE rotation search"""

__author__ = "Adam Simpkin & Felix Simkovic"
__date__ = "10 Oct 2017"
__version__ = "0.3"

import logging
import math
import os
import shutil
import uuid

logger = logging.getLogger(__name__)

import pyjob
import pyjob.misc

import simbad.db
import simbad.rotsearch.amore_score
import simbad.parsers.rotsearch_parser
import simbad.util.pdb_util
import simbad.util.mtz_util
import simbad.util.matthews_coef

EXPORT = "SET" if os.name == "nt" else "export"


class AmoreRotationSearch(object):
    """A class to perform the AMORE rotation search

    Attributes
    ----------
    amore_exe : str
        The path to the AMORE executable
    mtz : str
        The path to the input MTZ
    tmp_dir : str
        The path to the temporary directory
    work_dir : str
        The path to the working directory
    max_to_keep : int
        The maximum number of results to keep [default: 20]
    search_results : list
        The search results

    Examples
    --------
    >>> from simbad.rotsearch.amore_search import AmoreRotationSearch
    >>> rotation_search = AmoreRotationSearch('<amore_exe>', '<mtz>', '<tmp_dir>', '<work_dir>', '<max_to_keep>')
    >>> rotation_search.run(
    ...     '<models_dir>', '<nproc>', '<shres>', '<pklim>', '<npic>', '<rotastep>',
    ...     '<min_solvent_content>', '<submit_qtype>', '<submit_queue>', '<monitor>', '<chunk_size>'
    ... )
    >>> rotation_search.summarize('<csv_file>')
    >>> search_results = rotation_search.search_results

    If any results are found, an object is returned for each model containing the
    pdb_code and the various associated scores from AMORE.

    """

    def __init__(self, amore_exe, mtz, tmp_dir, work_dir, max_to_keep=20):
        self.amore_exe = amore_exe
        self.max_to_keep = max_to_keep
        self.mtz = mtz
        self.tmp_dir = tmp_dir
        self.work_dir = work_dir
        self._search_results = None

    def run(self, models_dir, nproc=2, shres=3.0, pklim=0.5, npic=50, rotastep=1.0,
            min_solvent_content=20, submit_qtype=None, submit_queue=None, monitor=None,
            chunk_size=0):
        """Run the AMORE rotation function on a directory of models

        Parameters
        ----------
        models_dir : str
            The directory containing the models to run the rotation search on
        nproc : int, optional
            The number of processors to run the job on [default: 2]
        shres : int, float, optional
            Spherical harmonic resolution [default: 3.0]
        pklim : int, float, optional
            Peak limit, output all peaks above <float> [default: 0.5]
        npic : int, optional
            Number of peaks to output from the translation function map for each orientation [default: 50]
        rotastep : int, float, optional
            Size of rotation step [default: 1.0]
        min_solvent_content : int, float, optional
            The minimum solvent content present in the unit cell with the input model [default: 20]
        submit_qtype : str
            The cluster submission queue type - currently supports SGE and LSF
        submit_queue : str
            The queue to submit to on the cluster
        monitor
        chunk_size : int, optional
            The number of jobs to submit at the same time; 0 submits all jobs in a single chunk [default: 0]

        Returns
        -------
        file
            A log file for each model in models_dir

        """
        simbad_dat_files = simbad.db.find_simbad_dat_files(models_dir)
        n_files = len(simbad_dat_files)
        sg, _, cell = simbad.util.mtz_util.crystal_data(self.mtz)
        cell = " ".join(map(str, cell))

        chunk_size = AmoreRotationSearch.get_chunk_size(n_files, chunk_size)
        total_chunk_cycles = AmoreRotationSearch.get_total_chunk_cycles(n_files, chunk_size)

        sol_calc = simbad.util.matthews_coef.SolventContent(cell, sg)

        dir_name = "simbad-tmp-" + str(uuid.uuid1())
        script_log_dir = os.path.join(self.work_dir, dir_name)
        os.mkdir(script_log_dir)

        hklpck0 = self._generate_hklpck0()

        ccp4_scr = os.environ["CCP4_SCR"]
        default_tmp_dir = os.path.join(self.work_dir, 'tmp')
        if self.tmp_dir:
            template_tmp_dir = os.path.join(self.tmp_dir, dir_name + "-{0}")
        else:
            template_tmp_dir = os.path.join(default_tmp_dir, dir_name + "-{0}")

        template_hklpck1 = os.path.join("$CCP4_SCR", "{0}.hkl")
        template_clmn0 = os.path.join("$CCP4_SCR", "{0}_spmipch.clmn")
        template_clmn1 = os.path.join("$CCP4_SCR", "{0}.clmn")
        template_mapout = os.path.join("$CCP4_SCR", "{0}_amore_cross.map")
        template_table1 = os.path.join("$CCP4_SCR", "{0}_sfs.tab")
        template_model = os.path.join("$CCP4_SCR", "{0}.pdb")
        template_rot_log = os.path.join("$CCP4_SCR", "{0}_rot.log")

        results = []
        iteration_range = range(0, n_files, chunk_size)
        for cycle, i in enumerate(iteration_range):
            logger.info("Working on chunk %d out of %d", cycle + 1, total_chunk_cycles)

            amore_files = []
            for dat_model in simbad_dat_files[i:i + chunk_size]:
                name = os.path.basename(dat_model.replace(".dat", ""))
                pdb_struct = simbad.util.pdb_util.PdbStructure(dat_model)
                solvent_content = sol_calc.calculate_from_struct(pdb_struct)
                if solvent_content < min_solvent_content:
                    msg = "Skipping %s: solvent content is predicted to be less than %.2f"
                    logger.debug(msg, name, min_solvent_content)
                    continue
                x, y, z, intrad = pdb_struct.integration_box

                logger.debug("Generating script to perform AMORE rotation function on %s", name)

                pdb_model = template_model.format(name)
                table1 = template_table1.format(name)
                hklpck1 = template_hklpck1.format(name)
                clmn0 = template_clmn0.format(name)
                clmn1 = template_clmn1.format(name)
                mapout = template_mapout.format(name)

                conv_py = "\"from simbad.db import convert_dat_to_pdb; convert_dat_to_pdb('{}', '{}')\""
                conv_py = conv_py.format(dat_model, pdb_model)

                tab_cmd = [self.amore_exe, "xyzin1", pdb_model, "xyzout1", pdb_model, "table1", table1]
                tab_stdin = self.tabfun_stdin_template.format(x=x, y=y, z=z, a=90, b=90, c=120)

                rot_cmd = [self.amore_exe, 'table1', table1, 'HKLPCK1', hklpck1, 'hklpck0', hklpck0,
                           'clmn1', clmn1, 'clmn0', clmn0, 'MAPOUT', mapout]
                rot_stdin = self.rotfun_stdin_template.format(shres=shres, intrad=intrad, pklim=pklim,
                                                              npic=npic, step=rotastep)
                rot_log = template_rot_log.format(name)

                tmp_dir = template_tmp_dir.format(name)
                cmd = [
                    [EXPORT, "CCP4_SCR=" + tmp_dir],
                    ["mkdir", "-p", "$CCP4_SCR\n"],
                    ["$CCP4/bin/ccp4-python", "-c", conv_py, "\n"],
                    tab_cmd + ["<< eof >", os.devnull],
                    [tab_stdin],
                    ["eof\n"],
                    rot_cmd + ["<< eof >", rot_log],
                    [rot_stdin],
                    ["eof\n"],
                    ["grep", "-m 1", "SOLUTIONRCD", rot_log, "\n"],
                    ["rm", "-rf", "$CCP4_SCR\n"],
                    [EXPORT, "CCP4_SCR=" + ccp4_scr],
                ]
                amore_script = pyjob.misc.make_script(cmd, directory=script_log_dir, prefix="amore_", stem=name)
                amore_log = amore_script.rsplit(".", 1)[0] + '.log'
                amore_files += [(amore_script, tab_stdin, rot_stdin, amore_log, dat_model)]

            if len(amore_files) > 0:
                logger.info("Running AMORE tab/rot functions")
                amore_scripts, _, _, amore_logs, dat_models = zip(*amore_files)
                self.submit_chunk(amore_scripts, script_log_dir, nproc, 'simbad_amore',
                                  submit_qtype, submit_queue, monitor)

                for dat_model, amore_log in zip(dat_models, amore_logs):
                    base = os.path.basename(amore_log)
                    pdb_code = base.replace("amore_", "").replace(".log", "")
                    RP = simbad.parsers.rotsearch_parser.RotsearchParser(amore_log)
                    score = simbad.rotsearch.amore_score.AmoreRotationScore(
                        pdb_code, dat_model, RP.alpha, RP.beta, RP.gamma, RP.cc_f, RP.rf_f,
                        RP.cc_i, RP.cc_p, RP.icp, RP.cc_f_z_score, RP.cc_p_z_score, RP.num_of_rot
                    )
                    if RP.cc_f_z_score:
                        results += [score]
            else:
                logger.critical("No structures to be trialled")

        self._search_results = results
        shutil.rmtree(script_log_dir)
        if os.path.isdir(default_tmp_dir):
            shutil.rmtree(default_tmp_dir)
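
    # Illustrative sketch only (not part of the original module): for a hypothetical
    # model entry "1abc.dat", the per-model script assembled above roughly expands to
    # the following POSIX shell commands (on Windows the EXPORT prefix is SET; the
    # uuid and directory paths are elided here):
    #
    #   export CCP4_SCR=<tmp_dir>/simbad-tmp-<uuid>-1abc
    #   mkdir -p $CCP4_SCR
    #   $CCP4/bin/ccp4-python -c "from simbad.db import convert_dat_to_pdb; \
    #       convert_dat_to_pdb('<models_dir>/1abc.dat', '$CCP4_SCR/1abc.pdb')"
    #   <amore_exe> xyzin1 $CCP4_SCR/1abc.pdb xyzout1 $CCP4_SCR/1abc.pdb \
    #       table1 $CCP4_SCR/1abc_sfs.tab << eof > /dev/null
    #   <tabfun stdin>
    #   eof
    #   <amore_exe> table1 $CCP4_SCR/1abc_sfs.tab HKLPCK1 $CCP4_SCR/1abc.hkl \
    #       hklpck0 <work_dir>/spmipch.hkl clmn1 $CCP4_SCR/1abc.clmn \
    #       clmn0 $CCP4_SCR/1abc_spmipch.clmn MAPOUT $CCP4_SCR/1abc_amore_cross.map \
    #       << eof > $CCP4_SCR/1abc_rot.log
    #   <rotfun stdin>
    #   eof
    #   grep -m 1 SOLUTIONRCD $CCP4_SCR/1abc_rot.log
    #   rm -rf $CCP4_SCR
    #   export CCP4_SCR=<original CCP4_SCR>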

    def summarize(self, csv_file):
        """Summarize the search results

        Parameters
        ----------
        csv_file : str
            The path for a backup CSV file

        Raises
        ------
        No results found
        """
        from simbad.util import summarize_result
        columns = [
            "ALPHA", "BETA", "GAMMA", "CC_F", "RF_F", "CC_I", "CC_P", "Icp",
            "CC_F_Z_score", "CC_P_Z_score", "Number_of_rotation_searches_producing_peak"
        ]
        summarize_result(self.search_results, csv_file=csv_file, columns=columns)
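
    # Illustrative usage sketch (hypothetical file name, not from the original module):
    #
    #   >>> rotation_search.summarize("amore_rotsearch.csv")
    #
    # writes the top `max_to_keep` results, one row per PDB code, using the columns
    # listed above.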

    def _generate_hklpck0(self):
        logger.info("Preparing files for AMORE rotation function")
        f, sigf, _, _, _, _, _ = simbad.util.mtz_util.get_labels(self.mtz)
        stdin = self.sortfun_stdin_template.format(f=f, sigf=sigf)
        hklpck0 = os.path.join(self.work_dir, 'spmipch.hkl')
        cmd = [self.amore_exe, 'hklin', self.mtz, 'hklpck0', hklpck0]
        pyjob.cexec(cmd, stdin=stdin)
        return hklpck0

    def _write_stdin(self, directory, prefix, name, content):
        f = pyjob.misc.tmp_file(directory=directory, prefix=prefix, stem=name, suffix=".stdin")
        with open(f, "w") as f_out:
            f_out.write(content)
        return f

    @property
    def search_results(self):
        return sorted(self._search_results, key=lambda x: float(x.CC_F_Z_score),
                      reverse=True)[:self.max_to_keep]

    @property
    def sortfun_stdin_template(self):
        return """TITLE ** spmi packing h k l F for crystal**
SORTFUN RESOL 100. 2.5
LABI FP={f} SIGFP={sigf}"""

    @property
    def tabfun_stdin_template(self):
        return """TITLE: Produce table for MODEL FRAGMENT
TABFUN
CRYSTAL {x} {y} {z} {a} {b} {c} ORTH 1
MODEL 1 BTARGET 23.5
SAMPLE 1 RESO 2.5 SHANN 2.5 SCALE 4.0"""

    @property
    def rotfun_stdin_template(self):
        return """TITLE: Generate HKLPCK1 from MODEL FRAGMENT 1
ROTFUN
GENE 1 RESO 100.0 {shres} CELL_MODEL 80 75 65
CLMN CRYSTAL ORTH 1 RESO 20.0 {shres} SPHERE {intrad}
CLMN MODEL 1 RESO 20.0 {shres} SPHERE {intrad}
ROTA CROSS MODEL 1 PKLIM {pklim} NPIC {npic} STEP {step}"""
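
    # Illustrative sketch (assumed column labels F/SIGF, not captured from a real run):
    # sortfun_stdin_template.format(f="F", sigf="SIGF") produces the keyworded stdin
    # that the AMORE SORTFUN step reads, i.e.
    #
    #   TITLE ** spmi packing h k l F for crystal**
    #   SORTFUN RESOL 100. 2.5
    #   LABI FP=F SIGFP=SIGF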

    @staticmethod
    def get_chunk_size(total, size):
        return total if size == 0 else size
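
    # Illustrative sketch (values assumed): a chunk size of 0 means "run everything
    # in a single chunk", e.g.
    #
    #   >>> AmoreRotationSearch.get_chunk_size(100, 0)
    #   100
    #   >>> AmoreRotationSearch.get_chunk_size(100, 25)
    #   25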

    @staticmethod
    def get_total_chunk_cycles(total, step):
        total_chunk_cycles, remainder = divmod(total, step)
        if remainder > 0:
            return total_chunk_cycles + 1
        else:
            return total_chunk_cycles
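
    # Illustrative sketch (values assumed): the number of chunk cycles is the ceiling
    # of total / step, e.g.
    #
    #   >>> AmoreRotationSearch.get_total_chunk_cycles(100, 25)
    #   4
    #   >>> AmoreRotationSearch.get_total_chunk_cycles(101, 25)
    #   5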

    @staticmethod
    def submit_chunk(chunk_scripts, run_dir, nproc, job_name, submit_qtype, submit_queue, monitor):
        """Submit jobs in small chunks to avoid using too much disk space

        Parameters
        ----------
        chunk_scripts : list
            List of scripts for each chunk
        run_dir : str
            The directory to run the jobs in
        nproc : int, optional
            The number of processors to run the job on
        job_name : str
            The name of the job to submit
        submit_qtype : str
            The cluster submission queue type - currently supports SGE and LSF
        submit_queue : str
            The queue to submit to on the cluster

        """
        j = pyjob.Job(submit_qtype)
        j.submit(chunk_scripts, directory=run_dir, name=job_name, nproc=nproc,
                 queue=submit_queue, permit_nonzero=True)
        interval = int(math.log(len(chunk_scripts)) / 3)
        interval_in_seconds = interval if interval >= 5 else 5
        j.wait(interval=interval_in_seconds, monitor=monitor)
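
# Illustrative sketch of the polling interval used by submit_chunk (values assumed,
# not from the original module): for a chunk of 100 scripts, int(math.log(100) / 3)
# evaluates to 1, which is below the 5-second floor, so j.wait polls every 5 seconds.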