"""Script to download or update SIMBAD-related databases"""
__author__ = "Felix Simkovic & Adam Simpkin"
__date__ = "17 May 2017"
__version__ = "1.0"
import argparse
import datetime
import glob
import json
import numpy as np
import morda
import os
import shutil
import sys
import tarfile
import urllib2
from distutils.version import StrictVersion
from pyjob import Job
from pyjob.misc import StopWatch, make_script, tmp_dir
import cctbx.crystal
import simbad
import simbad.command_line
import simbad.db
import simbad.exit
import simbad.rotsearch.amore_search
from simbad.util.pdb_util import PdbStructure
logger = None
# The space groups in the list below cannot be recognized by CCTBX, so we convert them
# to similar ones understandable by the library
CCTBX_ERROR_SG = {
'A1': 'P1',
'B2': 'B112',
'C1211': 'C2',
'F422': 'I422',
'I21': 'I2',
'I1211': 'I2',
'P21212A': 'P212121',
'R3': 'R3:R',
'C4212': 'P422',
}
SYS_PLATFORM = sys.platform
CUSTOM_PLATFORM = "linux" if SYS_PLATFORM in ["linux", "linux2"] \
else "mac" if SYS_PLATFORM in ["darwin"] \
else "windows"
[docs]class ContaminantSearchResult(object):
"""A basic contabase storing class"""
__slots__ = ('pdb_code', 'space_group', 'uniprot_name', 'uniprot_mnemonic')
def __init__(self, pdb_code, space_group, uniprot_name, uniprot_mnemonic):
self.pdb_code = pdb_code
self.space_group = space_group
self.uniprot_name = uniprot_name
self.uniprot_mnemonic = uniprot_mnemonic
def __repr__(self):
template = "{name}(pdb_code={pdb_code} space_group={space_group} " \
"uniprot_name={uniprot_name} uniprot_mnemonic={uniprot_mnemonic})"
return template.format(name=self.__class__.__name__, **{k: getattr(self, k) for k in self.__class__.__slots__})
def _as_dict(self):
"""Convert the :obj: BlastSearchResult object to a dictionary"""
dictionary = {}
for k in self.__slots__:
dictionary[k] = getattr(self, k)
return dictionary
[docs]def is_valid_db_location(database):
"""Validate permissions for a database"""
return os.access(os.path.dirname(os.path.abspath(database)), os.W_OK)
[docs]def download_morda():
"""Download the MoRDa database
Returns
-------
str
The path to the downloaded MoRDa database
"""
logger.info("Downloading MoRDa database from CCP4")
url = "http://www.ccp4.ac.uk/morda/MoRDa_DB.tar.gz"
local_db = os.path.basename(url)
# http://stackoverflow.com/a/34831866/3046533
chunk_size = 1 << 20
with open(local_db, "wb") as f_out:
query = urllib2.urlopen(url)
while True:
chunk = query.read(chunk_size)
if not chunk:
break
f_out.write(chunk)
with tarfile.open(local_db) as tar:
members = [
tarinfo for tarinfo in tar.getmembers()
if tarinfo.path.startswith("MoRDa_DB/home/ca_DOM") or tarinfo.path.startswith("MoRDa_DB/home/ca_DB")
or tarinfo.path.startswith("MoRDa_DB/pdb_DB_gz") or tarinfo.path.startswith(
"MoRDa_DB/" + "bin_" + CUSTOM_PLATFORM) or tarinfo.path.startswith("MoRDa_DB/list/domain_list.dat")
]
tar.extractall(members=members)
os.remove(local_db)
os.environ["MRD_DB"] = os.path.abspath("MoRDa_DB")
os.environ["MRD_PROG"] = os.path.join(os.path.abspath("MoRDa_DB"), "bin_" + CUSTOM_PLATFORM)
[docs]def create_lattice_db(database):
"""Create a lattice search database
Parameters
----------
database : str
The path to the database file
"""
if not is_valid_db_location(database):
raise RuntimeError("Permission denied! Cannot write to {}!".format(os.path.dirname(database)))
logger.info('Querying the RCSB Protein DataBank')
url = 'http://www.rcsb.org/pdb/rest/customReport.csv?pdbids=*&customReportColumns=lengthOfUnitCellLatticeA,'\
+ 'lengthOfUnitCellLatticeB,lengthOfUnitCellLatticeC,unitCellAngleAlpha,unitCellAngleBeta,' \
'unitCellAngleGamma,spaceGroup,experimentalTechnique&service=wsfile&format=csv'
crystal_data, error_count = [], 0
for line in urllib2.urlopen(url):
if line.startswith('structureId'):
continue
pdb_code, rest = line[1:-1].split('","', 1)
unit_cell, space_group, exp_tech = rest.rsplit('","', 2)
unit_cell = unit_cell.replace('","', ',')
space_group = space_group.replace(" ", "").strip()
if "X-RAY DIFFRACTION" not in exp_tech.strip().upper():
continue
try:
unit_cell = map(float, unit_cell.split(','))
except ValueError as e:
logger.debug('Skipping pdb entry %s\t%s', pdb_code, e)
error_count += 1
continue
space_group = CCTBX_ERROR_SG.get(space_group, space_group)
try:
symmetry = cctbx.crystal.symmetry(
unit_cell=unit_cell, space_group=space_group, correct_rhombohedral_setting_if_necessary=True)
except Exception as e:
logger.debug('Skipping pdb entry %s\t%s', pdb_code, e)
error_count += 1
continue
crystal_data.append((pdb_code, symmetry))
logger.debug('Error processing %d pdb entries', error_count)
logger.info('Calculating the Niggli cells')
niggli_data = np.zeros((len(crystal_data), 11))
# Leave this as list, .append is faster than .vstack
alt_niggli_data = []
for i, xtal_data in enumerate(crystal_data):
niggli_data[i][:4] = np.fromstring(xtal_data[0], dtype='uint8').astype(np.float64)
niggli_data[i][4] = ord('\x00')
niggli_data[i][5:] = np.array(xtal_data[1].niggli_cell().unit_cell().parameters()).round(decimals=3)
a, b, c, alpha, beta, gamma = niggli_data[i][5:]
# Add alternate niggli cell where a and b may be flipped
if np.allclose(a, b, atol=(b / 100.0 * 1.0)) and a != b and alpha != beta:
alt_niggli_data += [np.concatenate((niggli_data[i][:4], np.array([ord('*'), b, a, c, beta, alpha, gamma])))]
# Add alternate niggli cell where b and c may be flipped
if np.allclose(b, c, atol=(c / 100.0 * 1.0)) and b != c and beta != gamma:
alt_niggli_data += [np.concatenate((niggli_data[i][:4], np.array([ord('*'), a, c, b, alpha, gamma, beta])))]
niggli_data = np.vstack([niggli_data, np.asarray(alt_niggli_data)])
logger.info("Total Niggli cells loaded: %d", niggli_data.shape[0])
if not database.endswith('.npz'):
database += ".npz"
logger.info('Storing database in file: %s', database)
np.savez_compressed(database, niggli_data)
[docs]def create_contaminant_db(database, add_morda_domains, nproc=2, submit_qtype=None, submit_queue=False):
"""Create a contaminant database
Parameters
----------
database : str
The path to the database folder
add_morda_domains : bool
Retrospectively add morda domains to a contaminant database updated when morda was not installed
nproc : int, optional
The number of processors [default: 2]
submit_qtype : str
The cluster submission queue type - currently support SGE and LSF
submit_queue : str
The queue to submit to on the cluster
Raises
------
RuntimeError
dimple.contaminants.prepare module not available
RuntimeError
Windows is currently not supported
"""
if not is_valid_db_location(database):
raise RuntimeError("Permission denied! Cannot write to {}!".format(os.path.dirname(database)))
import dimple.main
if StrictVersion(dimple.main.__version__) < StrictVersion('2.5.7'):
msg = "This feature will be available with dimple version 2.5.7"
raise RuntimeError(msg)
if CUSTOM_PLATFORM == "windows":
msg = "Windows is currently not supported"
raise RuntimeError(msg)
import dimple.contaminants.prepare
dimple.contaminants.prepare.main(verbose=False)
simbad_dat_path = os.path.join(database, '*', '*', '*', '*.dat')
existing_dat_files = [os.path.basename(f).split('.')[0].lower() for f in glob.iglob(simbad_dat_path)]
erroneous_files = ['4v43']
dimple_files = ['cached', 'data.json', 'data.py']
with open("data.json") as data_file:
data = json.load(data_file)
results = []
for child in data["children"]:
try:
for child_2 in child["children"]:
space_group = child_2["name"].replace(" ", "")
for child_3 in child_2["children"]:
pdb_code = child_3["name"].split()[0].lower()
if (pdb_code in existing_dat_files or pdb_code in erroneous_files) and not add_morda_domains:
continue
uniprot_name = child["name"]
uniprot_mnemonic = uniprot_name.split('_')[1]
score = ContaminantSearchResult(pdb_code, space_group, uniprot_name, uniprot_mnemonic)
results.append(score)
except KeyError:
pass
if len(results) == 0:
logger.info("Contaminant database up to date")
else:
if add_morda_domains:
logger.info("Adding morda domains to contaminant database")
else:
logger.info("%d new entries were found in the contaminant database, " + "updating SIMBAD database",
len(results))
if "MRD_DB" in os.environ:
morda_installed_through_ccp4 = True
else:
morda_installed_through_ccp4 = False
if add_morda_domains and not morda_installed_through_ccp4:
logger.critical("Morda not installed locally, unable to add morda domains to contaminant database")
if morda_installed_through_ccp4:
morda_dat_path = os.path.join(os.environ['MRD_DB'], 'home', 'ca_DOM', '*.dat')
morda_dat_files = set([os.path.basename(f) for f in glob.iglob(morda_dat_path)])
exe = os.path.join(os.environ['MRD_PROG'], "get_model")
else:
logger.info(
"Morda not installed locally, therefore morda domains will not be added to contaminant database")
what_to_do = []
for result in results:
stem = os.path.join(os.getcwd(), database, result.uniprot_mnemonic, result.uniprot_name, result.space_group)
if not os.path.exists(stem):
os.makedirs(stem)
content = PdbStructure.get_pdb_content(result.pdb_code)
if content is None:
logger.debug("Encountered a problem downloading PDB %s - skipping entry", result.pdb_code)
else:
dat_content = simbad.db._str_to_dat(content)
with open(os.path.join(stem, result.pdb_code + ".dat"), "w") as f_out:
f_out.write(dat_content)
if simbad.db.is_valid_dat(os.path.join(stem, result.pdb_code + ".dat")):
pass
else:
logger.debug("Unable to convert %s to dat file", result.pdb_code)
if morda_installed_through_ccp4:
for dat_file in morda_dat_files:
if result.pdb_code.lower() == dat_file[0:4]:
stem = os.path.join(database, result.uniprot_mnemonic, result.uniprot_name, result.space_group,
"morda")
if not os.path.exists(stem):
os.makedirs(stem)
code = dat_file.rsplit('.', 1)[0]
final_file = os.path.join(stem, dat_file)
tmp_d = tmp_dir(directory=os.getcwd())
get_model_output = os.path.join(tmp_d, code + ".pdb")
script = make_script(
[["export CCP4_SCR=", tmp_d], ["cd", tmp_d], [exe, "-c", code, "-m", "d"]], directory=tmp_d)
log = script.rsplit('.', 1)[0] + '.log'
what_to_do += [(script, log, tmp_d, (get_model_output, final_file))]
if len(what_to_do) > 0:
scripts, _, tmps, files = zip(*what_to_do)
j = Job(submit_qtype)
j.submit(scripts, name='cont_db', nproc=nproc, queue=submit_queue)
j.wait()
for output, final in files:
if os.path.isfile(output):
simbad.db.convert_pdb_to_dat(output, final)
else:
print "File missing: {}".format(output)
for d in tmps:
shutil.rmtree(d)
for f in dimple_files:
if os.path.isdir(f):
shutil.rmtree(f)
elif os.path.isfile(f):
os.remove(f)
validate_compressed_database(database)
[docs]def create_morda_db(database, nproc=2, submit_qtype=None, submit_queue=False, chunk_size=5000):
"""Create the MoRDa search database
Parameters
----------
database : str
The path to the database folder
nproc : int, optional
The number of processors [default: 2]
submit_qtype : str
The cluster submission queue type - currently support SGE and LSF
submit_queue : str
The queue to submit to on the cluster
chunk_size : int, optional
The number of jobs to submit at the same time [default: 5000]
Raises
------
RuntimeError
Windows is currently not supported
"""
if CUSTOM_PLATFORM == "windows":
msg = "Windows is currently not supported"
raise RuntimeError(msg)
if not is_valid_db_location(database):
raise RuntimeError("Permission denied! Cannot write to {}!".format(os.path.dirname(database)))
if "MRD_DB" in os.environ:
morda_installed_through_ccp4 = True
else:
download_morda()
morda_installed_through_ccp4 = False
morda_dat_path = os.path.join(os.environ['MRD_DB'], 'home', 'ca_DOM', '*.dat')
simbad_dat_path = os.path.join(database, '**', '*.dat')
morda_dat_files = set([os.path.basename(f) for f in glob.glob(morda_dat_path)])
simbad_dat_files = set([os.path.basename(f) for f in glob.glob(simbad_dat_path)])
erroneous_files = set(["1bbzA_0.dat", "1gt0D_0.dat", "1h3oA_0.dat", "1kskA_1.dat", "1l0sA_0.dat"])
def delete_erroneous_files(erroneous_paths):
for f in erroneous_paths:
if os.path.isfile(f):
logger.warning("File flagged to be erroneous ... " + "removing from database: %s", f)
os.remove(f)
erroneous_paths = [os.path.join(database, name[1:3], name) for name in erroneous_files]
delete_erroneous_files(erroneous_paths)
dat_files = list(morda_dat_files - simbad_dat_files - erroneous_files)
if len(dat_files) < 1:
logger.info('SIMBAD database up-to-date')
if not morda_installed_through_ccp4:
shutil.rmtree(os.environ["MRD_DB"])
leave_timestamp(os.path.join(database, 'simbad_morda.txt'))
return
else:
logger.info("%d new entries were found in the MoRDa database, " + "updating SIMBAD database", len(dat_files))
exe = os.path.join(os.environ["MRD_PROG"], "get_model")
run_dir = tmp_dir(directory=os.getcwd())
# Submit in chunks, so we don't take too much disk space
# and can terminate without loosing the processed data
total_chunk_cycles = len(dat_files) // chunk_size + (len(dat_files) % 5 > 0)
for cycle, i in enumerate(range(0, len(dat_files), chunk_size)):
logger.info("Working on chunk %d out of %d", cycle + 1, total_chunk_cycles)
chunk_dat_files = dat_files[i:i + chunk_size]
# Create the database files
what_to_do = []
for f in chunk_dat_files:
code = os.path.basename(f).rsplit('.', 1)[0]
final_file = os.path.join(database, code[1:3], code + ".dat")
# We need a temporary directory within because "get_model" uses non-unique file names
tmp_d = tmp_dir(directory=run_dir)
get_model_output = os.path.join(tmp_d, code + ".pdb")
script = make_script(
[["export CCP4_SCR=", tmp_d], ["export MRD_DB=" + os.environ['MRD_DB']], ["cd", tmp_d],
[exe, "-c", code, "-m", "d"]],
directory=tmp_d)
log = script.rsplit('.', 1)[0] + '.log'
what_to_do += [(script, log, tmp_d, (get_model_output, final_file))]
scripts, _, tmps, files = zip(*what_to_do)
j = Job(submit_qtype)
j.submit(scripts, name='morda_db', nproc=nproc, queue=submit_queue)
j.wait()
sub_dir_names = set([os.path.basename(f).rsplit('.', 1)[0][1:3] for f in chunk_dat_files])
for sub_dir_name in sub_dir_names:
sub_dir = os.path.join(database, sub_dir_name)
if os.path.isdir(sub_dir):
continue
os.makedirs(sub_dir)
for output, final in files:
if os.path.isfile(output):
simbad.db.convert_pdb_to_dat(output, final)
else:
logger.critical("File missing: {}".format(output))
for d in tmps:
shutil.rmtree(d)
shutil.rmtree(run_dir)
if not morda_installed_through_ccp4:
shutil.rmtree(os.environ["MRD_DB"])
validate_compressed_database(database)
leave_timestamp(os.path.join(database, 'simbad_morda.txt'))
[docs]def create_db_custom(custom_db, database):
"""Create custom database
Parameters
----------
custom_db : str
The path to the input database of PDB files
database : str
The path to the output database folder
Raises
------
RuntimeError
Windows is currently not supported
"""
if CUSTOM_PLATFORM == "windows":
msg = "Windows is currently not supported"
raise RuntimeError(msg)
if not is_valid_db_location(custom_db):
raise RuntimeError("Permission denied! Cannot write to {}!".format(os.path.dirname(custom_db)))
custom_dat_files = set([
os.path.join(root, filename) for root, _, files in os.walk(custom_db) for filename in files
if filename.endswith('.pdb')
])
simbad_dat_path = os.path.join(database, '**', '*.dat')
simbad_dat_files = set([os.path.basename(f) for f in glob.glob(simbad_dat_path)])
dat_files = list(custom_dat_files - simbad_dat_files)
if len(dat_files) < 1:
logger.info('SIMBAD dat database up-to-date')
leave_timestamp(os.path.join(database, 'simbad_custom.txt'))
return
else:
logger.info("%d new entries were found in the custom database, updating SIMBAD database", len(dat_files))
files = []
for input_file in dat_files:
code = os.path.basename(input_file).rsplit('.', 1)[0]
time_stamp = str(datetime.datetime.now())
final_file = os.path.join(database, time_stamp, code + ".dat")
files += [(input_file, final_file)]
sub_dir = os.path.join(database, time_stamp)
if os.path.isdir(sub_dir):
continue
os.makedirs(sub_dir)
for output, final in files:
simbad.db.convert_pdb_to_dat(output, final)
validate_compressed_database(database)
leave_timestamp(os.path.join(database, 'simbad_custom.txt'))
[docs]def create_db_argparse():
"""Argparse function database creationg"""
p = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
sp = p.add_subparsers(help='Database-specific commands')
pa = sp.add_parser('lattice', help='lattice database')
pa.set_defaults(which="lattice")
pa.add_argument(
'-debug_lvl',
type=str,
default='info',
help='The console verbosity level < notset | info | debug | warning | error | critical > ')
pa.add_argument('-latt_db', type=str, default=simbad.LATTICE_DB, help='Path to local copy of the lattice database')
pb = sp.add_parser('contaminant', help='Contaminant database')
pb.set_defaults(which="contaminant")
simbad.command_line._argparse_job_submission_options(pb)
pb.add_argument(
'-debug_lvl',
type=str,
default='info',
help='The console verbosity level < notset | info | debug | warning | error | critical > ')
pb.add_argument(
'-cont_db', type=str, default=simbad.CONTAMINANT_MODELS, help='Path to local copy of the contaminant database')
pb.add_argument(
'--add_morda_domains',
default=False,
action="store_true",
help="Retrospectively add morda domains to an updated contaminant database")
pc = sp.add_parser('morda', help='morda database')
pc.set_defaults(which="morda")
simbad.command_line._argparse_job_submission_options(pc)
pc.add_argument(
'-chunk_size', default=5000, type=int, help='Max jobs to submit at any given time [disk space dependent')
pc.add_argument(
'-debug_lvl',
type=str,
default='info',
help='The console verbosity level < notset | info | debug | warning | error | critical > ')
pc.add_argument('simbad_db', type=str, help='Path to local copy of the SIMBAD database')
pd = sp.add_parser('custom', help='custom database')
pd.set_defaults(which="custom")
pd.add_argument(
'custom_db', type=str, help='Path to local copy of the custom database of PDB files in SIMBAD format')
pd.add_argument(
'-debug_lvl',
type=str,
default='info',
help='The console verbosity level < notset | info | debug | warning | error | critical > ')
pd.add_argument('input_db', type=str, help='Path to local copy of the custom database of PDB files')
pe = sp.add_parser('validate', help='validate database')
pe.set_defaults(which="validate")
pe.add_argument(
'-debug_lvl',
type=str,
default='info',
help='The console verbosity level < notset | info | debug | warning | error | critical > ')
pe.add_argument('database', type=str, help='The database to validate')
return p
[docs]def leave_timestamp(f):
"""Write the current date & time to a file"""
with open(f, 'w') as f_out:
f_out.write(str(datetime.datetime.now()))
[docs]def validate_compressed_database(dir):
"""Validate an installation of a SIMBAD database"""
logger.info("Validating compressed database")
for directory, _, files in os.walk(dir):
for f in files:
infile = os.path.join(directory, f)
if infile.endswith(".dat"):
logger.debug("Validating file: %s", infile)
if not simbad.db.is_valid_dat(infile):
logger.critical("Corrupted file: %s", infile)
else:
logger.debug("Ignoring file: %s", infile)
[docs]def main():
"""SIMBAD database creation function"""
p = create_db_argparse()
args = p.parse_args()
global logger
log_class = simbad.command_line.LogController()
log_class.add_console(level=args.debug_lvl)
logger = log_class.get_logger()
simbad.command_line.print_header()
stopwatch = StopWatch()
stopwatch.start()
if args.which == "lattice":
create_lattice_db(args.latt_db)
elif args.which == "contaminant":
create_contaminant_db(
args.cont_db,
args.add_morda_domains,
nproc=args.nproc,
submit_qtype=args.submit_qtype,
submit_queue=args.submit_queue)
elif args.which == "morda":
create_morda_db(
args.simbad_db,
nproc=args.nproc,
submit_qtype=args.submit_qtype,
submit_queue=args.submit_queue,
chunk_size=args.chunk_size)
elif args.which == "custom":
create_db_custom(args.input_db, args.custom_db)
elif args.which == "validate":
if os.path.isdir(args.database):
validate_compressed_database(args.database)
else:
logger.critical("Unable to validate the following database: %s", args.database)
stopwatch.stop()
logger.info("Database creation completed in %d days, %d hours, %d minutes, and %d seconds", *stopwatch.time_pretty)
log_class.close()
if __name__ == "__main__":
import logging
logging.basicConfig(level=logging.NOTSET)
try:
main()
except Exception:
simbad.exit.exit_error(*sys.exc_info())