diff --git a/ConfPrune.pyx b/ConfPrune.pyx new file mode 100644 index 0000000..76f60fe --- /dev/null +++ b/ConfPrune.pyx @@ -0,0 +1,435 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Jan 30 12:33:18 2015 + +@author: ke291 + +Cython file for conformer alignment and RMSD pruning. Called by Gaussian.py +and NWChem.py +""" + +from math import sqrt +from libc.stdlib cimport malloc, free +""" +Main function of this file. Takes an array of [x, y, z] for each molecule, +as well as a list of atom symbols in the same order as their coordinates + +Translates both molecules to origin, gets the rotation matrix from quatfit +and rotates the second molecule to align with the first +""" +def RMSDPrune(conformers, atoms, cutoff): + + ToDel = [] + cdef long int c1, c2, c + cdef long int l = len(conformers) + cdef double res + cdef double *RMSDMatrix = malloc(l * l * sizeof(double)) + + for c1 in range(0, l): + for c2 in range(c1, l): + if c1==c2: + RMSDMatrix[c2 + c1*l]=0.0 + else: + res = AlignedRMS(conformers[c1], conformers[c2], atoms) + RMSDMatrix[c2 + c1*l] = res + RMSDMatrix[c1 + c2*l] = res + #Check for similar conformations + for c1 in range(0, l): + for c2 in range(0, l): + if c1!=c2 and (not c1 in ToDel) and (not c2 in ToDel): + #if Align.AlignedRMS(conformers[c1], conformers[c2], atoms) < cutoff: + # ToDel.append(c2) + if RMSDMatrix[c2 + c1*l]malloc(l * l * sizeof(double)) + + for c1 in range(0, l): + for c2 in range(c1, l): + if c1==c2: + RMSDMatrix[c2 + c1*l]=0.0 + else: + res = AlignedRMS(conformers[c1], conformers[c2], atoms) + RMSDMatrix[c2 + c1*l] = res + RMSDMatrix[c1 + c2*l] = res + #Check for similar conformations + for c1 in range(0, l): + for c2 in range(0, l): + if c1!=c2 and (not c1 in ToDel) and (not c2 in ToDel): + if RMSDMatrix[c2 + c1*l]ConfLimit: + AdjCutoff +=0.2 + ToDel = [] + for c1 in range(0, l): + for c2 in range(0, l): + if c1!=c2 and (not c1 in ToDel) and (not c2 in ToDel): + if RMSDMatrix[c2 + c1*l]0.0): + dma = d[j] - d[i] + if (abs(dma)+abs(b) < abs(dma)): + t = b/dma + else: + q = 0.5 * dma / b + t = fsign(1.0 / (abs(q) + sqrt(1.0 + q*q)), q) + + c = 1.0 / sqrt(t*t + 1.0) + s = t * c + a[i][j] = 0.0 + + for k in range(0, i-1): + atemp = c * a[k][i] - s * a[k][j] + a[k][j] = s * a[k][i] + c * a[k][j] + a[k][i] = atemp + + for k in range(i+1, j): + atemp = c * a[i][k] - s * a[k][j] + a[k][j] = s * a[i][k] + c * a[k][j] + a[i][k] = atemp + + for k in range(j, n): + atemp = c * a[i][k] - s * a[j][k] + a[j][k] = s * a[i][k] + c * a[j][k] + a[i][k] = atemp + + for k in range(0, n): + vtemp = c * v[k][i] - s * v[k][j] + v[k][j] = s * v[k][i] + c * v[k][j] + v[k][i] = vtemp + + dtemp = c * c * d[i] + s * s * d[j] - 2.0 + c * s * b + d[j] = s * s * d[i] + c * c * d[j] + 2.0 * c * s * b + d[i] = dtemp + nrot = l + + for j in range(0, n-1): + k = j + dtemp = d[k] + for i in range(j, n): + if d[i]j: + d[k] = d[j] + d[j] = dtemp + for i in range(0, n): + dtemp = v[i][k] + v[i][k] = v[i][j] + v[i][j] = dtemp + + cdef double ret[5][4] + for i in range(0,4): + for j in range(0,4): + ret[i][j]=v[i][j] + for i in range(0,4): + ret[4][i] = d[i] + + +cdef q2mat(double q[4]): + + u = [[0.0, 0.0, 0.0],[0.0, 0.0, 0.0],[0.0, 0.0, 0.0]] + u[0][0] = q[0]*q[0] + q[1]*q[1] - q[2]*q[2] - q[3]*q[3] + u[1][0] = 2.0 *(q[1] * q[2] - q[0] * q[3]) + u[2][0] = 2.0 *(q[1] * q[3] + q[0] * q[2]) + u[0][1] = 2.0 *(q[2] * q[1] + q[0] * q[3]) + u[1][1] = q[0]*q[0] - q[1]*q[1] + q[2]*q[2] - q[3]*q[3] + u[2][1] = 2.0 *(q[2] * q[3] - q[0] * q[1]) + u[0][2] = 2.0 *(q[3] * q[1] - q[0] * q[2]) + u[1][2] = 2.0 
*(q[3] * q[2] + q[0] * q[1]) + u[2][2] = q[0]*q[0] - q[1]*q[1] - q[2]*q[2] + q[3]*q[3] + + return u + +def fsign(a, b): + if (b>=0): + return a + else: + return -a \ No newline at end of file diff --git a/DP4.jar b/DP4.jar new file mode 100644 index 0000000..0e22309 Binary files /dev/null and b/DP4.jar differ diff --git a/DP4.py b/DP4.py new file mode 100644 index 0000000..79d1ca6 --- /dev/null +++ b/DP4.py @@ -0,0 +1,229 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed May 27 14:18:37 2015 +Updated on July 30 14:18:37 2015 +@author: ke291 + +Equivalent and compact port of DP4.jar to python. The results +produced are essentially equivalent, but not identical due to different +floating point precision used in the Python (53 bits) and Java (32 bits) +implementation. +""" +from scipy import stats +import pickle +import re +import bisect + +meanC = 0.0 +meanH = 0.0 +stdevC = 2.269372270818724 +stdevH = 0.18731058105269952 +output = [] + + +def main(Clabels, Cvalues, Hlabels, Hvalues, Cexp, Hexp, settings): + + Print(str(Cexp)) + Print(str(Hexp)) + + C_cdp4 = [] + H_cdp4 = [] + Comb_cdp4 = [] + + for isomer in range(0, len(Cvalues)): + + sortedClabels, sortedCvalues, sortedCexp =\ + AssignExpNMR(Clabels, Cvalues[isomer], Cexp) + scaledC = ScaleNMR(sortedCvalues, sortedCexp) + + sortedHlabels, sortedHvalues, sortedHexp = \ + AssignExpNMR(Hlabels, Hvalues[isomer], Hexp) + scaledH = ScaleNMR(sortedHvalues, sortedHexp) + + ScaledErrorsC = [sortedCexp[i] - scaledC[i] + for i in range(0, len(scaledC))] + ScaledErrorsH = [sortedHexp[i] - scaledH[i] + for i in range(0, len(scaledH))] + + Print("\nAssigned shifts for isomer " + str(isomer+1) + ": ") + PrintNMR('C', sortedClabels, sortedCvalues, scaledC, sortedCexp) + Print("Max C error: " + format(max(ScaledErrorsC, key=abs), "6.2f")) + PrintNMR('H', sortedHlabels, sortedHvalues, scaledH, sortedHexp) + Print("Max H error: " + format(max(ScaledErrorsH, key=abs), "6.2f")) + + if settings.PDP4: + C_cdp4.append(CalculateCDP4(ScaledErrorsC, meanC, stdevC)) + H_cdp4.append(CalculateCDP4(ScaledErrorsH, meanH, stdevH)) + elif settings.EP5: + C_cdp4.append(CalculatePDP4(ScaledErrorsC, meanC, stdevC)) + #C_cdp4.append(CalculateKDE(ScaledErrorsC, settings.ScriptDir + '/NucCErr.pkl')) + H_cdp4.append(CalculatePDP4(ScaledErrorsH, meanH, stdevH)) + #H_cdp4.append(CalculateKDE(ScaledErrorsH, settings.ScriptDir + '/NucHErr.pkl')) + Comb_cdp4.append(C_cdp4[-1]*H_cdp4[-1]) + Print("\nDP4 based on C: " + format(C_cdp4[-1], "6.2e")) + Print("DP4 based on H: " + format(H_cdp4[-1], "6.2e")) + + relCDP4 = [(100*x)/sum(C_cdp4) for x in C_cdp4] + relHDP4 = [(100*x)/sum(H_cdp4) for x in H_cdp4] + relCombDP4 = [(100*x)/sum(Comb_cdp4) for x in Comb_cdp4] + + PrintRelDP4('both carbon and proton data', relCombDP4) + PrintRelDP4('carbon data only', relCDP4) + PrintRelDP4('proton data only', relHDP4) + + return output + + +def AssignExpNMR(labels, calcShifts, exp): + + #Replace all 'or' and 'OR' with ',', remove all spaces and 'any' + exp = re.sub(r"or|OR", ',', exp, flags=re.DOTALL) + exp = re.sub(r" |any", '', exp, flags=re.DOTALL) + + #Get all assignments, split mulitassignments + ExpLabels = re.findall(r"(?<=\().*?(?=\))", exp, flags=re.DOTALL) + ExpLabels = [x.split(',') for x in ExpLabels] + + #Remove assignments and get shifts + ShiftData = (re.sub(r"\(.*?\)", "", exp, flags=re.DOTALL)).split(',') + expShifts = [float(x) for x in ShiftData] + + #Prepare sorted calculated data with labels and sorted exp data + sortedCalc = sorted(calcShifts) + sortedExp = sorted(expShifts) + 
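    #The assignment labels are kept in the same ascending-shift order as the
    #experimental values; the three passes further down then (1) place the
    #unambiguous single-label assignments, (2) resolve multi-label
    #assignments against calculated labels that are still free, and
    #(3) fill any remaining shifts with the first free calculated label.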
sortedExpLabels = SortExpAssignments(expShifts, ExpLabels) + sortedCalcLabels = [] + for v in sortedCalc: + index = calcShifts.index(v) + sortedCalcLabels.append(labels[index]) + + assignedExpLabels = ['' for i in range(0, len(sortedExp))] + + #First pass - assign the unambiguous shifts + for v in range(0, len(sortedExp)): + if len(sortedExpLabels[v]) == 1 and sortedExpLabels[v][0] != '': + #Check that assignment exists in computational data + if sortedExpLabels[v][0] in labels: + assignedExpLabels[v] = sortedExpLabels[v][0] + else: + Print("Label " + sortedExpLabels[v][0] + + " not found in among computed shifts, please check NMR assignment.") + quit() + #Second pass - assign shifts from a limited set + for v in range(0, len(sortedExp)): + if len(sortedExpLabels[v]) != 1 and sortedExpLabels[v][0] != '': + for l in sortedCalcLabels: + if l in sortedExpLabels[v] and l not in assignedExpLabels: + assignedExpLabels[v] = l + break + #Final pass - assign unassigned shifts in order + for v in range(0, len(sortedExp)): + if sortedExpLabels[v][0] == '': + for l in sortedCalcLabels: # Take the first free label + if l not in assignedExpLabels: + assignedExpLabels[v] = l + break + sortedCalc = [] + #Rearrange calc values to match the assigned labels + for l in assignedExpLabels: + index = labels.index(l) + sortedCalc.append(calcShifts[index]) + + return assignedExpLabels, sortedCalc, sortedExp + + +def SortExpAssignments(shifts, assignments): + tempshifts = list(shifts) + tempassignments = list(assignments) + sortedassignments = [] + while len(tempassignments) > 0: + index = tempshifts.index(min(tempshifts)) + sortedassignments.append(tempassignments[index]) + tempshifts.pop(index) + tempassignments.pop(index) + return sortedassignments + + +#Scale the NMR shifts +def ScaleNMR(calcShifts, expShifts): + slope, intercept, r_value, p_value, std_err = stats.linregress(expShifts, + calcShifts) + scaled = [(x-intercept)/slope for x in calcShifts] + return scaled + + +def PrintNMR(nucleus, labels, values, scaled, exp): + Print("\nAssigned " + nucleus + + " shifts: (label, calc, corrected, exp, error)") + for i in range(0, len(labels)): + Print(format(labels[i], "6s") + ' ' + format(values[i], "6.2f") + ' ' + + format(scaled[i], "6.2f") + ' ' + format(exp[i], "6.2f") + ' ' + + format(exp[i]-scaled[i], "6.2f")) + + +def PrintRelDP4(title, RelDP4): + Print("\nResults of DP4 using " + title + ":") + for i in range(0, len(RelDP4)): + Print("Isomer " + str(i+1) + ": " + format(RelDP4[i], "4.1f") + "%") + + +def Print(s): + print s + output.append(s) + + +def CalculateCDP4(errors, expect, stdev): + cdp4 = 1.0 + for e in errors: + z = abs((e-expect)/stdev) + cdp4 = cdp4*2*stats.norm.cdf(-z) + return cdp4 + + +#Alternative function using probability density function instead of cdf +def CalculatePDP4(errors, expect, stdev): + pdp4 = 1.0 + for e in errors: + #z = (e-expect)/stdev + #pdp4 = pdp4*stats.norm.pdf(z) + pdp4 = pdp4*stats.norm(expect, stdev).pdf(e) + return pdp4 + + +#use as CalculateKDE(errors, 'NucCErr.pkl') for C or +#CalculateKDE(errors, 'NucHErr.pkl') for H +#load empirical error data from file and use KDE to construct pdf +def CalculateKDE(errors, PickleFile): + + pkl_file = open(PickleFile, 'rb') + ErrorData = pickle.load(pkl_file) + kde = stats.gaussian_kde(ErrorData) + + ep5 = 1.0 + for e in errors: + #z = (e-expect)/stdev + #pdp4 = pdp4*stats.norm.pdf(z) + ep5 = ep5*float(kde(e)[0]) + return ep5 + + +#use as CalculateRKDE(errors, 'RKDEC.pkl') for C or +#CalculateKDE(errors, 'RKDEH.pkl') for H 
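#(the shifts argument gives the chemical shift of each nucleus; bisect places
#it into one of the stored shift regions and the matching per-region KDE then
#supplies the likelihood of that nucleus' error)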
+#load empirical error data from file and use KDE to construct several pdfs, +#one for each chemical shift region +def CalculateRKDE(errors, shifts, PickleFile): + + #Load the data + pkl_file = open(PickleFile, 'rb') + regions, RErrors = pickle.load(pkl_file) + + #Reconstruct the distributions for each region + kdes = [] + for es in RErrors: + kdes.append(stats.gaussian_kde(es)) + + ep5 = 1.0 + for i, e in enumerate(errors): + region = bisect.bisect_left(regions, shifts[i]) + ep5 = ep5*float((kdes[region])(e)[0]) + return ep5 diff --git a/FiveConf.py b/FiveConf.py new file mode 100644 index 0000000..672fddc --- /dev/null +++ b/FiveConf.py @@ -0,0 +1,259 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Dec 17 15:20:18 2014 + +@author: ke291 + +Gets called by PyDP4.py if automatic 5-membered cycle corner-flipping is used. +""" + +import numpy as np +from math import sqrt, pi, cos, sin, acos +import scipy.optimize as sciopt +import sys +sys.path.append( + '/home/ke291/Tools/openbabel-install/lib/python2.7/site-packages/') +from openbabel import * + + +def main(f, settings): + + """ + Find the axis atoms + Find all the atoms to be rotated + + Rotate it and the substiuents to the other side of the plane + """ + obconversion = OBConversion() + obconversion.SetInFormat("sdf") + obmol = OBMol() + + obconversion.ReadFile(obmol, f) + + obmol.ConnectTheDots() + + #Find the atoms composing furan ring + Rings = obmol.GetSSSR() + furan = [] + for ring in Rings: + if len(settings.RingAtoms) == 5: + if all(x in ring._path for x in settings.RingAtoms): + furan = ring + break + else: + if ring.Size() == 5 and not ring.IsAromatic(): + furan = ring + break + + if furan == []: + "No five membered rings to rotate. Quitting..." + quit() + #Find the plane of the 5-membered ring and the outlying atom + norm, d, outAtom = FindFuranPlane(obmol, furan) + + #Find the atoms connected to the outlying atom and sort them + #as either part of the ring(axis atoms) or as atoms to be rotated + AxisAtoms = [] + RotAtoms = [] + + for NbrAtom in OBAtomAtomIter(outAtom): + #if NbrAtom.IsInRingSize(5): + if furan.IsInRing(NbrAtom.GetIdx()): + AxisAtoms.append(NbrAtom) + else: + RotAtoms.append(NbrAtom) + FindSubstAtoms(NbrAtom, outAtom, RotAtoms) + + #Simple switch to help detect if the atoms are rotated the right way + WasAbove90 = False + angle = FindRotAngle(AxisAtoms[0], AxisAtoms[1], outAtom, norm) + if angle > 0.5*pi: + WasAbove90 = True + rotangle = 2*(angle-0.5*pi) + else: + WasAbove90 = False + rotangle = 2*(0.5*pi-angle) + OldAtomCoords = outAtom.GetVector() + print "Atom " + str(outAtom.GetAtomicNum()) + " will be rotated by " +\ + str(rotangle*57.3) + ' degrees' + RotateAtom(outAtom, AxisAtoms[0], AxisAtoms[1], rotangle) + angle2 = FindRotAngle(AxisAtoms[0], AxisAtoms[1], outAtom, norm) + + #if the atom is on the same side of the plane as it was, + # it has been rotated in the wrong direction + if ((angle2 > 0.5*pi) and WasAbove90) or ((angle2 < 0.5*pi) and not WasAbove90): + #Flip the sign of the rotation angle, restore the coords + #and rotate the atom in the opposite direction + print "Atom was rotated the wrong way, switching the direction" + rotangle = -rotangle + outAtom.SetVector(OldAtomCoords) + RotateAtom(outAtom, AxisAtoms[0], AxisAtoms[1], rotangle) + + RotatedAtoms = [] # Index to make sure that atoms are not rotated twice + for atom in RotAtoms: + if atom not in RotatedAtoms: + RotateAtom(atom, AxisAtoms[0], AxisAtoms[1], rotangle) + RotatedAtoms.append(atom) + else: + print "Atom already rotated, 
skipping" + + obconversion.SetOutFormat("sdf") + obconversion.WriteFile(obmol, f[:-4] + 'rot.sdf') + + +#Recursively finds all the atoms connected to the input +def FindSubstAtoms(atom, outAtom, al): + + indexes = [a.GetIdx() for a in al] + for NbrAtom in OBAtomAtomIter(atom): + + if (NbrAtom.GetIdx() not in indexes) and\ + (NbrAtom.GetIdx() != outAtom.GetIdx()): + al.append(NbrAtom) + FindSubstAtoms(NbrAtom, outAtom, al) + + +#Rotate atom around and axis by an angle +def RotateAtom(atom, AxisAtom1, AxisAtom2, angle): + + [u, v, w] = GetUnitVector(AxisAtom1, AxisAtom2) + [x, y, z] = [atom.x(), atom.y(), atom.z()] + [a, b, c] = [(AxisAtom1.x()+AxisAtom1.x())/2, (AxisAtom1.y()+AxisAtom1.y())/2,\ + (AxisAtom1.z()+AxisAtom1.z())/2] + + X = (a*(v**2 + w**2) - u*(b*v+c*w-u*x-v*y-w*z))*(1-cos(angle))+x*cos(angle)\ + +(-1*c*v+b*w-w*y+v*z)*sin(angle) + Y = (b*(u**2 + w**2) - v*(a*u+c*w-u*x-v*y-w*z))*(1-cos(angle))+y*cos(angle)\ + +(c*u-a*w+w*x-u*z)*sin(angle) #was _+_u*z)*sin(angle) + Z = (c*(u**2 + v**2) - w*(a*u+b*v-u*x-v*y-w*z))*(1-cos(angle))+z*cos(angle)\ + +(-1*b*u+a*v-v*x+u*y)*sin(angle) + + atom.SetVector(X, Y, Z) + + +def GetUnitVector(Atom1, Atom2): + vector = [] + vector.append(Atom2.x() - Atom1.x()) + vector.append(Atom2.y() - Atom1.y()) + vector.append(Atom2.z() - Atom1.z()) + + length = np.linalg.norm(vector) + return [x/length for x in vector] + + +#Finds the angle by which atoms need to be rotated by taking the angle +#the atom is out of the plane (the 2 neighbor atoms being the axis) +#and doubling it +def FindRotAngle(AxisAtom1, AxisAtom2, OutAtom, Normal): + + start = [] + start.append((AxisAtom1.x() + AxisAtom2.x())/2) + start.append((AxisAtom1.y() + AxisAtom2.y())/2) + start.append((AxisAtom1.z() + AxisAtom2.z())/2) + + vector = [] + vector.append(OutAtom.x() - start[0]) + vector.append(OutAtom.y() - start[1]) + vector.append(OutAtom.z() - start[2]) + + #Angle between plane normal and OOP atom + vangle = angle(vector, Normal) + + #print "Measured angle: " + str(vangle*57.3) + return vangle + + +def crossproduct(v1, v2): + product = [0, 0, 0] + product[0] = v1[1]*v2[2]-v1[2]*v2[1] + product[1] = v1[2]*v2[0]-v1[0]*v2[2] + product[2] = v1[0]*v2[1]-v1[1]*v2[0] + return product + + +def dotproduct(v1, v2): + return sum((a*b) for a, b in zip(v1, v2)) + + +def length(v): + return sqrt(dotproduct(v, v)) + + +def angle(v1, v2): + return acos(dotproduct(v1, v2) / (length(v1) * length(v2))) + + +"""Finds planes for every 3 atoms, calculates distances to the plane +for the other 2 atoms andchoose the plane with the smallest smallest distance +""" +def FindFuranPlane(mol, furan): + + atomIds = furan._path + atoms = [] + + for i in atomIds: + atoms.append(mol.GetAtom(i)) + + MinError = 100.0 + + for atom in atoms: + pats = [a for a in atoms if a != atom] + norm, d, error = LstSqPlane(pats[0], pats[1], pats[2], pats[3]) + if error < MinError: + MinError = error + MaxNorm = norm + MaxD = d + OutAtom = atom + + return MaxNorm, MaxD, OutAtom + + +#Given 3 atoms, finds a plane defined by a normal vector and d +def FindPlane(atom1, atom2, atom3): + + vector1 = [atom2.x() - atom1.x(), atom2.y() - atom1.y(), + atom2.z() - atom1.z()] + vector2 = [atom3.x() - atom1.x(), atom3.y() - atom1.y(), + atom3.z() - atom1.z()] + cross_product = [vector1[1] * vector2[2] - vector1[2] * vector2[1], + -1 * vector1[0] * vector2[2] - vector1[2] * vector2[0], + vector1[0] * vector2[1] - vector1[1] * vector2[0]] + + d = cross_product[0] * atom1.x() - cross_product[1] * atom1.y() + \ + cross_product[2] * atom1.z() + + 
return cross_product, d + + +def LstSqPlane(atom1, atom2, atom3, atom4): + + # Inital guess of the plane + [a0, b0, c0], d0 = FindPlane(atom1, atom2, atom3) + + f = lambda (a, b, c, d): PlaneError([atom1, atom2, atom3, atom4], a, b, c, d) + res = sciopt.minimize(f, (a0, b0, c0, d0), method='nelder-mead') + plane = list(res.x) + + return plane[:3], plane[3], f(plane) + + +def PlaneError(atoms, a, b, c, d): + dists = [] + for atom in atoms: + dists.append(abs(PointPlaneDist([a, b, c], d, atom))) + return sum(dists)/len(dists) + + +#Calculates distance from an atom to a plane +def PointPlaneDist(norm, d, atom): + + point = [] + + point.append(atom.x()) + point.append(atom.y()) + point.append(atom.z()) + + a = norm[0]*point[0] + norm[1]*point[1] + norm[2]*point[2] + d + b = sqrt(norm[0]**2 + norm[1]**2 + norm[2]**2) + + return a/b diff --git a/Gaussian.py b/Gaussian.py new file mode 100644 index 0000000..28200e9 --- /dev/null +++ b/Gaussian.py @@ -0,0 +1,437 @@ +#!/usr/bin/env python +from __future__ import division +# -*- coding: utf-8 -*- +""" +Created on Wed Nov 19 15:56:54 2014 + +@author: ke291 + +Contains all of the Gaussian specific code for input generation and calculation +execution. Called by PyDP4.py. +""" + +import Tinker +import MacroModel + +import subprocess +import socket +import os +import time +import sys +import glob +import pyximport +pyximport.install() +import ConfPrune + + +def SetupGaussian(MMoutp, Gausinp, numDigits, settings, adjRMSDcutoff): + + if settings.MMTinker: + #Reads conformer geometry, energies and atom labels from Tinker output + (atoms, conformers, charge) = Tinker.ReadTinker(MMoutp, settings) + else: + (atoms, conformers, charge) = MacroModel.ReadMacromodel(MMoutp, + settings) + + #Prune similar conformations, if the number exceeds the limit + if len(conformers) > settings.PerStructConfLimit: + pruned = ConfPrune.RMSDPrune(conformers, atoms, adjRMSDcutoff) + else: + pruned = conformers + + print str(len(conformers) - len(pruned)) +\ + " or " + "{:.1f}".format(100*(len(conformers) - len(pruned)) / + len(conformers))+"% of conformations have been pruned based on " +\ + str(adjRMSDcutoff) + " angstrom cutoff" + + for num in range(0, len(pruned)): + filename = Gausinp+str(num+1).zfill(3) + if not settings.DFTOpt: + WriteGausFile(filename, pruned[num], atoms, charge, settings) + else: + WriteGausFileOpt(filename, pruned[num], atoms, charge, settings) + + print str(len(pruned)) + " .com files written" + + +#Adjust the RMSD cutoff to keep the conformation numbers reasonable +def AdaptiveRMSD(MMoutp, settings): + + if settings.MMTinker: + #Reads conformer geometry, energies and atom labels from Tinker output + (atoms, conformers, charge) = Tinker.ReadTinker(MMoutp, settings) + else: + (atoms, conformers, charge) = MacroModel.ReadMacromodel(MMoutp, + settings) + + return ConfPrune.AdaptRMSDPrune(conformers, atoms, + settings.InitialRMSDcutoff, + settings.PerStructConfLimit) + + +def WriteGausFile(Gausinp, conformer, atoms, charge, settings): + + f = file(Gausinp + '.com', 'w') + f.write('%mem=2800MB\n%chk='+Gausinp + '.chk\n') + #f.write('# b3lyp/6-31g(d,p) Opt nmr=giao\n') + if settings.Solvent != '': + f.write('# b3lyp/6-31g(d,p) nmr=giao scrf=(solvent=' + + settings.Solvent+')\n') + else: + f.write('# b3lyp/6-31g(d,p) nmr=giao\n') + f.write('\n'+Gausinp+'\n\n') + f.write(str(charge) + ' 1\n') + + natom = 0 + + for atom in conformer: + f.write(atoms[natom] + ' ' + atom[1] + ' ' + atom[2] + ' ' + + atom[3] + '\n') + natom = natom + 1 + f.write('\n') + f.close() + 
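#Illustrative sketch only, not part of the original file: BuildGausInput is a
#hypothetical helper showing the .com layout that WriteGausFile above
#produces - the Link 0 section, a B3LYP/6-31G(d,p) GIAO route with an
#optional PCM solvent keyword, the title, the charge/multiplicity line and
#the Cartesian coordinates.
def BuildGausInput(name, conformer, atoms, charge, solvent=''):
    route = '# b3lyp/6-31g(d,p) nmr=giao'
    if solvent != '':
        route = route + ' scrf=(solvent=' + solvent + ')'
    lines = ['%mem=2800MB', '%chk=' + name + '.chk', route, '', name, '',
             str(charge) + ' 1']
    for i in range(0, len(conformer)):
        #conformer[i] holds [atom number, x, y, z] as strings, as read from
        #the Tinker/MacroModel output
        lines.append(atoms[i] + ' ' + conformer[i][1] + ' ' +
                     conformer[i][2] + ' ' + conformer[i][3])
    return '\n'.join(lines) + '\n\n'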
+ +def WriteGausFileOpt(Gausinp, conformer, atoms, charge, settings): + + #write the initial DFT geometry optimisation input file first + f1 = file(Gausinp + 'a.com', 'w') + f1.write('%mem=2800MB\n%chk='+Gausinp + '.chk\n') + + if settings.Solvent != '': + f1.write('# b3lyp/6-31g(d,p) Opt=(maxcycles=30) scrf=(solvent=' + + settings.Solvent+')\n') + else: + f1.write('# b3lyp/6-31g(d,p) Opt=(maxcycles=30)\n') + + f1.write('\n'+Gausinp+'\n\n') + f1.write(str(charge) + ' 1\n') + + natom = 0 + + for atom in conformer: + f1.write(atoms[natom] + ' ' + atom[1] + ' ' + atom[2] + ' ' + + atom[3] + '\n') + natom = natom + 1 + f1.write('\n') + f1.close() + + #Write the nmr prediction input file, + #using the geometry from checkpoint file + f2 = file(Gausinp + 'b.com', 'w') + f2.write('%mem=2800MB\n%chk='+Gausinp + '.chk\n') + if settings.Solvent != '': + f2.write('# b3lyp/6-31g(d,p) nmr=giao geom=checkpoint scrf=(solvent=' + + settings.Solvent+')\n') + else: + f2.write('# b3lyp/6-31g(d,p) nmr=giao geom=checkpoint\n') + f2.write('\n'+Gausinp+'\n\n') + f2.write(str(charge) + ' 1\n') + f2.write('\n') + f2.close() + + +def GetFiles2Run(inpfiles, settings): + #Get the names of all relevant input files + GinpFiles = [] + for filename in inpfiles: + if not settings.DFTOpt: + GinpFiles = GinpFiles + glob.glob(filename + 'ginp???.com') + else: + GinpFiles = GinpFiles + glob.glob(filename + 'ginp???a.com') + Files2Run = [] + + #for every input file check that there is a completed output file, + #delete the incomplete outputs and add the inputs to be done to Files2Run + for filename in GinpFiles: + if not settings.DFTOpt: + if not os.path.exists(filename[:-3]+'out'): + Files2Run.append(filename) + else: + if IsGausCompleted(filename[:-3] + 'out'): + #print filename[:-3]+'out already exists' + continue + else: + os.remove(filename[:-3] + 'out') + Files2Run.append(filename) + else: + if not os.path.exists(filename[:-5]+'.out'): + Files2Run.append(filename) + else: + if IsGausCompleted(filename[:-5] + '.out'): + #print filename[:-3]+'out already exists' + continue + else: + os.remove(filename[:-5] + '.out') + Files2Run.append(filename) + + return Files2Run + + +def IsGausCompleted(f): + Gfile = open(f, 'r') + outp = Gfile.readlines() + Gfile.close() + if len(outp) < 10: + return False + if "Normal termination" in outp[-1]: + return True + else: + return False + + +#Still need addition of support for geometry optimisation +def RunOnZiggy(folder, queue, GausFiles, settings): + + print "ziggy GAUSSIAN job submission script\n" + + #Check that folder does not exist, create job folder on ziggy + outp = subprocess.check_output('ssh ziggy ls', shell=True) + if folder in outp: + print "Folder exists on ziggy, choose another folder name." + return + + outp = subprocess.check_output('ssh ziggy mkdir ' + folder, shell=True) + + #Write the qsub scripts + for f in GausFiles: + if not settings.DFTOpt: + WriteSubScript(f[:-4], queue, folder, settings) + else: + WriteSubScriptOpt(f[:-4], queue, folder, settings) + print str(len(GausFiles)) + ' .qsub scripts generated' + + #Upload .com files and .qsub files to directory + print "Uploading files to ziggy..." 
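    #scp each Gaussian input to the job folder on ziggy together with its
    #.qsub submission script; when DFT optimisation is on, two inputs per
    #conformer are sent - the 'a.com' geometry optimisation and the 'b.com'
    #NMR prediction that reads the optimised geometry from the checkpoint.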
+ for f in GausFiles: + if not settings.DFTOpt: + outp = subprocess.check_output('scp ' + f +' ziggy:~/' + folder, + shell=True) + else: + outp = subprocess.check_output('scp ' + f[:-4] +'a.com ziggy:~/' + + folder, shell=True) + outp = subprocess.check_output('scp ' + f[:-4] +'b.com ziggy:~/' + + folder, shell=True) + outp = subprocess.check_output('scp ' + f[:-4] +'.qsub ziggy:~/' + + folder, shell=True) + + print str(len(GausFiles)) + ' .com and .qsub files uploaded to ziggy' + + #Launch the calculations + for f in GausFiles: + job = '~/' + folder + '/' + f[:-4] + outp = subprocess.check_output('ssh ziggy qsub -q ' + queue + ' -o ' + + job + '.log -e ' + job + '.err ' + job + '.qsub', shell=True) + + print str(len(GausFiles)) + ' jobs submitted to the queue on ziggy' + + outp = subprocess.check_output('ssh ziggy showq', shell=True) + if settings.user in outp: + print "Jobs are running on ziggy" + + Jobs2Complete = list(GausFiles) + n2complete = len(Jobs2Complete) + + #Check and report on the progress of calculations + while len(Jobs2Complete) > 0: + JustCompleted = [job for job in Jobs2Complete if + IsZiggyGComplete(job[:-3] + 'out', folder, settings)] + Jobs2Complete[:] = [job for job in Jobs2Complete if + not IsZiggyGComplete(job[:-3] + 'out', folder, settings)] + if n2complete != len(Jobs2Complete): + n2complete = len(Jobs2Complete) + print str(n2complete) + " remaining." + + time.sleep(60) + + #When done, copy the results back + print "\nCopying the output files back to localhost..." + print 'ssh ziggy scp /home/' + settings.user + '/' + folder + '/*.out ' +\ + socket.getfqdn() + ':' + os.getcwd() + outp = subprocess.check_output('ssh ziggy scp /home/' + settings.user + + '/' + folder + '/*.out ' + socket.getfqdn() + + ':' + os.getcwd(), shell=True) + + #Delete the *.chk files from /sharedscratch/ + print "\nCleaning up - deleting the *.chk files form /sharedscratch/" + for f in GausFiles: + outp = subprocess.check_output('ssh ziggy rm -r /sharedscratch/' + + settings.user + '/' + f[:-4], shell=True) + + +def WriteSubScript(GausJob, queue, ZiggyJobFolder, settings): + + if not (os.path.exists(GausJob+'.com')): + print "The input file " + GausJob + ".com does not exist. Exiting..." 
+ return + + #Create the submission script + QSub = open(GausJob + ".qsub", 'w') + + #Choose the queue + QSub.write('#PBS -q ' + queue + '\n#PBS -l nodes=1:ppn=1\n#\n') + + #define input files and output files + QSub.write('file=' + GausJob + '\n\n') + QSub.write('inpfile=${file}.com\noutfile=${file}.out\n') + + #define cwd and scratch folder and ask the machine + #to make it before running the job + QSub.write('HERE=/home/' + settings.user +'/' + ZiggyJobFolder + '\n') + QSub.write('SCRATCH=/sharedscratch/' + settings.user + '/' + + GausJob + '\n') + QSub.write('mkdir ${SCRATCH}\n') + + #Setup GAUSSIAN environment variables + QSub.write('set OMP_NUM_THREADS=1\n') + QSub.write('export GAUSS_EXEDIR=/usr/local/shared/gaussian/em64t/09-D01/g09\n') + QSub.write('export g09root=/usr/local/shared/gaussian/em64t/09-D01\n') + QSub.write('export PATH=/usr/local/shared/gaussian/em64t/09-D01/g09:$PATH\n') + QSub.write('export GAUSS_SCRDIR=$SCRATCH\n') + + #copy the input file to scratch + QSub.write('cp ${HERE}/${inpfile} $SCRATCH\ncd $SCRATCH\n') + + #write useful info to the job output file (not the gaussian) + QSub.write('echo "Starting job $PBS_JOBID"\necho\n') + QSub.write('echo "PBS assigned me this node:"\ncat $PBS_NODEFILE\necho\n') + + QSub.write('ln -s $HERE/$outfile $SCRATCH/$outfile\n') + QSub.write('$GAUSS_EXEDIR/g09 < $inpfile > $outfile\n') + + #Cleanup + QSub.write('mkdir ${HERE}/${file}\n') + QSub.write('cp ${SCRATCH}/*.chk $HERE/${file}/\n') + QSub.write('rm -f ${SCRATCH}/*\n') + QSub.write('cp $HERE/${file}/*.chk ${SCRATCH}/\n') + QSub.write('rm -r ${HERE}/${file}\n') + QSub.write('qstat -f $PBS_JOBID\n') + + QSub.close() + +#Function to write ziggy script when dft optimisation is used +def WriteSubScriptOpt(GausJob, queue, ZiggyJobFolder, settings): + + if not (os.path.exists(GausJob+'a.com')): + print "The input file " + GausJob + "a.com does not exist. Exiting..." + return + if not (os.path.exists(GausJob+'b.com')): + print "The input file " + GausJob + "b.com does not exist. Exiting..." 
+ return + + #Create the submission script + QSub = open(GausJob + ".qsub", 'w') + + #Choose the queue + QSub.write('#PBS -q ' + queue + '\n#PBS -l nodes=1:ppn=1\n#\n') + + #define input files and output files + QSub.write('file=' + GausJob + '\n\n') + QSub.write('inpfile1=${file}a.com\ninpfile2=${file}b.com\n') + QSub.write('outfile1=${file}temp.out\noutfile2=${file}.out\n') + + #define cwd and scratch folder and ask the machine + #to make it before running the job + QSub.write('HERE=/home/' + settings.user +'/' + ZiggyJobFolder + '\n') + QSub.write('SCRATCH=/sharedscratch/' + settings.user + '/' + GausJob + '\n') + QSub.write('mkdir ${SCRATCH}\n') + + #Setup GAUSSIAN environment variables + QSub.write('set OMP_NUM_THREADS=1\n') + QSub.write('export GAUSS_EXEDIR=/usr/local/shared/gaussian/em64t/09-D01/g09\n') + QSub.write('export g09root=/usr/local/shared/gaussian/em64t/09-D01\n') + QSub.write('export PATH=/usr/local/shared/gaussian/em64t/09-D01/g09:$PATH\n') + QSub.write('export GAUSS_SCRDIR=$SCRATCH\n') + + #copy the input files to scratch + QSub.write('cp ${HERE}/${inpfile1} $SCRATCH\n') + QSub.write('cp ${HERE}/${inpfile2} $SCRATCH\ncd $SCRATCH\n') + + #write useful info to the job output file (not the gaussian) + QSub.write('echo "Starting job $PBS_JOBID"\necho\n') + QSub.write('echo "PBS assigned me this node:"\ncat $PBS_NODEFILE\necho\n') + + QSub.write('ln -s $HERE/$outfile2 $SCRATCH/$outfile2\n') + QSub.write('$GAUSS_EXEDIR/g09 < $inpfile1 > $outfile1\n') + QSub.write('$GAUSS_EXEDIR/g09 < $inpfile2 > $outfile2\n') + + #Cleanup + QSub.write('mkdir ${HERE}/${file}\n') + QSub.write('cp ${SCRATCH}/*.chk $HERE/${file}/\n') + QSub.write('rm -f ${SCRATCH}/*\n') + QSub.write('cp $HERE/${file}/*.chk ${SCRATCH}/\n') + QSub.write('rm -r ${HERE}/${file}\n') + QSub.write('qstat -f $PBS_JOBID\n') + + QSub.close() + + +def IsZiggyGComplete(f, folder, settings): + + path = '/home/' + settings.user + '/' + folder + '/' + try: + outp1 = subprocess.check_output('ssh ziggy ls ' + path, shell=True) + except subprocess.CalledProcessError, e: + print "ssh ziggy ls failed: " + str(e.output) + return False + if f in outp1: + try: + outp2 = subprocess.check_output('ssh ziggy cat ' + path + f, + shell=True) + except subprocess.CalledProcessError, e: + print "ssh ziggy cat failed: " + str(e.output) + return False + if "Normal termination" in outp2: + return True + return False + + +""" +Change to support tautomers - treat a tautomer as a few extra conformers with +different file names +Correction: Treat them as diastereomers and submit to nmrpredict, but remember +that they are tautomers, so that their populations can be optimized +""" +def RunNMRPredict(numDS, *args): + + TautInputs = [] + Ntaut = [] + NumFiles = [] + arg = 0 + + #Pick all tautomer counts and filenames + for ds in range(0, numDS): + Ntaut.append(int(args[arg])) + arg = arg+1 + TautInputs.append([]) + for taut in range(0, Ntaut[ds]): + TautInputs[ds].append('') + NumFiles.append(0) + for f in glob.glob(args[arg] + 'ginp*.out'): + TautInputs[ds][taut] = TautInputs[ds][taut] + f[:-4] + ' ' + NumFiles[-1] = NumFiles[-1] + 1 + arg = arg+1 + + outputs = [] + + #This loop runs nmrPredict for each diastereomer and collects + #the outputs + """ To change: run nmrpredict for each tautomer seperately""" + for ds in range(0, numDS): + for taut in range(0, Ntaut[ds]): + + #Prepares input for nmrPredict + javafolder = getScriptPath() + jinp = 'CLASSPATH=' + javafolder + \ + ' java nmrPredictGaussian ' + TautInputs[ds][taut] + print jinp + + #Runs java 
nmrPredict JagName001, ... and collects output + outputs.append(subprocess.check_output(jinp, shell=True)) + #print '\n\n' + outputs[isomer] + + return (outputs, NumFiles, Ntaut) + + +def getScriptPath(): + return os.path.dirname(os.path.realpath(sys.argv[0])) diff --git a/InchiGen.py b/InchiGen.py new file mode 100644 index 0000000..fd8b7b6 --- /dev/null +++ b/InchiGen.py @@ -0,0 +1,666 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Dec 17 15:20:18 2014 + +@author: ke291 + +Code for diastereomer, tautomer and protomer generation via InChI strings. +This file gets called by PyDP4.py if diastereomer and/or tautomer and/or +protomer generation is used. +""" +import sys +sys.path.append('/home/ke291/Tools/openbabel-install/lib/python2.7/site-packages/') +from openbabel import * +import subprocess +import itertools + +MolConPath = '/home/ke291/ChemAxon/MarvinBeans/bin/molconvert' + + +def main(f): + + inchi, aux = GetInchi(f) + + print inchi + + ds_inchis = GenDiastereomers(inchi) + ds_inchis = [FixTautProtons(f, i, aux) for i in ds_inchis] + + for ds in range(0, len(ds_inchis)): + Inchi2Struct(ds_inchis[ds], f[:-4] + str(ds+1), aux) + RestoreNumsSDF(f[:-4] + str(ds+1) + '.sdf', f, aux) + + """taut_inchis = GenTautomers(inchi) + + for taut in range(0, len(taut_inchis)): + Inchi2Struct(taut_inchis[taut], 'taut' + str(taut), aux)""" + + +def GetInchiRenumMap(AuxInfo): + + for l in AuxInfo.split('/'): + if 'N:' in l: + RenumLayer = l + break + amap = [int(x) for x in RenumLayer[2:].split(',')] + return amap + + +def FixTautProtons(f, inchi, AuxInfo): + + #Get tautomeric protons and atoms they are connected to from Inchi + TautProts = GetTautProtons(inchi) + amap = GetInchiRenumMap(AuxInfo) + + #get the correspondence of the Inchi numbers to the source numbers + hmap = [] + for taut in TautProts: + for heavyatom in range(1, len(taut)): + hmap.append([int(taut[heavyatom]), amap[int(taut[heavyatom])-1]]) + + #Read molecule from file + obconversion = OBConversion() + obconversion.SetInFormat("sdf") + obmol = OBMol() + obconversion.ReadFile(obmol, f) + + Fixprotpos = [] + for heavyatom in hmap: + atom = obmol.GetAtom(heavyatom[1]) + for nbratom in OBAtomAtomIter(atom): + if nbratom.GetAtomicNum() == 1: + Fixprotpos.append(heavyatom[0]) + draftFH = [] + for i in range(0, len(Fixprotpos)): + if Fixprotpos[i] not in [a[0] for a in draftFH]: + draftFH.append([Fixprotpos[i], Fixprotpos.count(Fixprotpos[i])]) + + fixedlayer = '/f/h' + for h in draftFH: + if h[1] == 1: + fixedlayer = fixedlayer + str(h[0])+'H,' + else: + fixedlayer = fixedlayer + str(h[0])+'H' + str(h[1]) + ',' + + resinchi = inchi + fixedlayer[:-1] + + return resinchi + + +#Get H connections from sdf file +def GetHcons(f): + obconversion = OBConversion() + obconversion.SetInFormat("sdf") + obmol = OBMol() + obconversion.ReadFile(obmol, f) + Hcons = [] + for atom in OBMolAtomIter(obmol): + idx = atom.GetIdx() + anum = atom.GetAtomicNum() + if anum == 1: + for NbrAtom in OBAtomAtomIter(atom): + Hcons.append([idx, NbrAtom.GetIdx()]) + return Hcons + + +def RestoreNumsSDF(f, fold, AuxInfo): + + #Read molecule from file + obconversion = OBConversion() + obconversion.SetInFormat("sdf") + obmol = OBMol() + obconversion.ReadFile(obmol, f) + #Get the atoms Hs are connected to + oldHcons = GetHcons(fold) + #translate the H connected atoms to the new numbering system + amap = GetInchiRenumMap(AuxInfo) + for i in range(0, len(oldHcons)): + oldHcons[i][1] = amap.index(oldHcons[i][1])+1 + + newHcons = [] + temp = [] + i = 0 + for atom in 
OBMolAtomIter(obmol): + idx = atom.GetIdx() + anum = atom.GetAtomicNum() + #If atom is hydrogen, check what it is connected to + if anum == 1: + for NbrAtom in OBAtomAtomIter(atom): + newHcons.append([idx, NbrAtom.GetIdx()]) + #Pick the temporary atom + temp.append(atom) + + for i in range(0, len(newHcons)): + conatom = newHcons[i][1] + for b in range(0, len(oldHcons)): + if conatom == oldHcons[b][1]: + amap.append(oldHcons[b][0]) + #remove the number, so that it doesn't get added twice + oldHcons[b][1] = 0 + + newmol = OBMol() + added = [] + + for i in range(1, len(amap)+1): + newn = amap.index(i) + newmol.AddAtom(temp[newn]) + added.append(newn) + + #Final runthrough to check that all atoms have been added, + #tautomeric protons can be missed. If tautomeric proton tracking + #is implemented this can be removed + for i in range(0, len(temp)): + if not i in added: + newmol.AddAtom(temp[i]) + + #Restore the bonds + newmol.ConnectTheDots() + newmol.PerceiveBondOrders() + #Write renumbered molecule to file + obconversion.SetOutFormat("sdf") + obconversion.WriteFile(newmol, f) + + +def GetInchi(f): + + outp = subprocess.check_output(MolConPath + ' inchi ' + f, shell=True) + idata = outp.split('\n') + + aux = idata[1][:] + return idata[0], aux + + +def Inchi2Struct(inchi, f, aux): + + infile = open(f + '.inchi', 'w') + infile.write(inchi) + infile.close() + + outp = subprocess.check_output(MolConPath + ' sdf ' + f + + '.inchi -3:S{fine}[prehydrogenize] -o ' + f + '.sdf', shell=True) + + +def GenProtomers(structf, atoms): + + f = structf + ".sdf" + inchi, aux = GetInchi(f) + print inchi + amap = GetInchiRenumMap(aux) + + prot_atoms = [] + for atom in atoms: + prot_atoms.append(amap.index(atom)) + + finchi = FixTautProtons(f, inchi, aux) + prot_inchis = GenProtInchis(finchi, prot_atoms) + + print prot_inchis + filenames = [] + for prot in range(0, len(prot_inchis)): + Inchi2Struct(prot_inchis[prot], f[:-4] + 'p' + str(prot+1), aux) + RestoreNumsSDF(f[:-4] + 'p' + str(prot+1) + '.sdf', f, aux) + filenames.append(f[:-4] + 'p' + str(prot+1)) + return len(filenames), filenames + + +def GenProtInchis(inchi, atoms): + + #Read and interpret proton counts on heavy atoms from inchi, + # including tautomeric layers + protons, formula, tautomerics, fprotons = ReadProtonCounts(inchi) + + #Construct list of heavy atoms with tautomeric protons and which system + #they belong to + tlist = [[], []] + i = 0 + for tlayer in tautomerics: + temp = tlayer[1:-1].split(',') + tlist[0].extend([int(x) for x in temp[1:]]) + tlist[1].extend([i for x in temp[1:]]) + print tlist + #Increase H count and regenerate the formula + for i in range(0, len(formula)): + if formula[i][0] == 'H': + formula[i][1] += 1 + formula[i][1] = str(formula[i][1]) + formula = [''.join(x) for x in formula] + formula = ''.join(formula) + + #For each basic atom in atoms, generate a copy of original protons + #add atom and save it + print "Protonating atoms with these InChI numbers: " +\ + str([x+1 for x in atoms]) + protlayers = [] + fprotlayers = [] + for atom in atoms: + if atom+1 not in tlist[0]: + extraprotons = list(protons) + extrafprotons = list(fprotons) + extraprotons[atom] += 1 + if tautomerics == []: + protlayers.append(WriteProtonCounts(extraprotons)) + else: + protlayers.append(WriteProtonCounts(extraprotons) + + ',' + ','.join(tautomerics)) + fprotlayers.append(WriteProtonCounts(extrafprotons)) + else: + extraprotons = list(protons) + extrafprotons = list(fprotons) + extratautomerics = list(tautomerics) + extrafprotons[atom] += 1 + 
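            #the atom belongs to a mobile-H (tautomeric) group, so in the
            #main h layer the extra proton is added to that group's count
            #rather than to the atom itself; the fixed-H layer still records
            #it on the specific atom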
#which tautomeric system atom belongs to? + tindex = tlist[0].index(atom+1) + tindex = tlist[1][tindex] + temp = tautomerics[tindex].split(',') + #Get the proton count and increase by 1 + protcount = int(temp[0][2:]) + 1 + #Write the proton count back + temp[0] = temp[0][:2] + str(protcount) + extratautomerics[tindex] = ','.join(temp) + protlayers.append(WriteProtonCounts(extraprotons)+',' + + ','.join(extratautomerics)) + fprotlayers.append(WriteProtonCounts(extrafprotons)) + protinchis = [] + protinchis.append(inchi) + for l in range(0, len(protlayers)): + layers = inchi.split('/') + MainLayerPassed = False + ChargeAdded = False + i = 1 + while (i < len(layers)): + if 'h' in layers[i]: + if not MainLayerPassed: + layers[i] = protlayers[l] + MainLayerPassed = True + if 'q' not in inchi: + layers.insert(i+1, 'q+1') + ChargeAdded = True + else: + layers[i] = fprotlayers[l] + if ('q' in layers[i]) and ChargeAdded is False: + charge = int(layers[i][1:]) + layers[i] = 'q'+"%+d" % charge + if 'C' in layers[i] and 'H' in layers[i]: + layers[i] = formula + i += 1 + #insert charge layer here + protinchis.append('/'.join(layers)) + return protinchis + + +def WriteProtonCounts(protons): + collectedprotons = [[], [], [], []] + + i = 1 + lastcount = protons[0] + start = 0 + while i < len(protons): + if protons[i] != lastcount: + if start == i-1: + collectedprotons[lastcount].append(str(start+1)) + else: + collectedprotons[lastcount].append(str(start+1)+'-'+str(i)) + lastcount = protons[i] + start = i + i += 1 + + if start == i-1: + collectedprotons[lastcount].append(str(start+1)) + else: + collectedprotons[lastcount].append(str(start+1)+'-'+str(i)) + + hlayer = 'h' + if len(collectedprotons[1]) > 0: + hlayer += ','.join(collectedprotons[1]) + hlayer += 'H,' + if len(collectedprotons[2]) > 0: + hlayer += ','.join(collectedprotons[2]) + hlayer += 'H2,' + if len(collectedprotons[3]) > 0: + hlayer += ','.join(collectedprotons[3]) + hlayer += 'H3,' + hlayer = hlayer[:-1] + return hlayer + + +def ReadProtonCounts(inchi): + import re + + #Get inchi layers + layers = inchi.split('/') + ProtLayer = '' + FixedLayer = '' + for l in layers[1:]: + if 'C' in l and 'H' in l: + atoms = re.findall(r"[a-zA-Z]+", l) + indexes = [int(x) for x in re.findall(r"\d+", l)] + formula = [list(x) for x in zip(atoms, indexes)] + if 'h' in l and ProtLayer != '': + FixedLayer = l[1:] + if 'h' in l and ProtLayer == '': + ProtLayer = l[1:] + + #initialize proton list + nheavy = sum([x[1] for x in formula if x[0] != 'H']) + + #Find, save and remove tautomeric portions from main proton layer + tautomerics = re.findall(r"\(.*?\)", ProtLayer) + ProtLayer = re.sub(r"\(.*?\)", "", ProtLayer) + if ProtLayer[-1] == ',': + ProtLayer = ProtLayer[:-1] + + #Read the main and the fixed proton layer + protons = ReadPSections(ProtLayer, nheavy) + fprotons = ReadPSections(FixedLayer, nheavy) + + return protons, formula, tautomerics, fprotons + + +def ReadPSections(ProtLayer, nheavy): + import re + protons = [0 for x in range(0, nheavy)] + #seperate the 1proton, 2proton and 3proton atoms, then seperate the records + psections = [x for x in re.findall(r".*?(?=H)", ProtLayer) if x != ''] + secvals = [0 for x in range(0, len(psections))] + psections[0] = psections[0].split(',') + + #interpret each record and fill in the proton table + #start by finding the proton count value for each section + for i in range(1, len(psections)): + if psections[i][0] == ',': + secvals[i-1] = 1 + psections[i] = psections[i][1:].split(',') + else: + secvals[i-1] = 
int(psections[i][0]) + psections[i] = psections[i][2:].split(',') + if ProtLayer[-1] != 'H': + secvals[-1] = int(ProtLayer[-1]) + else: + secvals[-1] = 1 + + #now expand each entry in the sections and fill the corresponding value + #in proton table + for i in range(0, len(psections)): + for s in psections[i]: + if '-' in s: + [start, finish] = [int(x) for x in s.split('-')] + protons[start-1:finish] = [secvals[i] for x in + range(0, len(protons[start-1:finish]))] + else: + protons[int(s)-1] = secvals[i] + return protons + + +def GetTautProtons(inchi): + #get the tautomer layer and pickup the data + layers = inchi.split('/') + + for l in layers: + if 'h' in l: + ProtLayer = l + ProtList = list(ProtLayer) + starts = [] + ends = [] + for i in range(0, len(ProtList)): + if ProtList[i] == '(': + starts.append(i) + if ProtList[i] == ')': + ends.append(i) + TautProts = [] + for i in range(0, len(starts)): + TautProts.append((ProtLayer[starts[i]+1:ends[i]]).split(',')) + + return TautProts + + +def GenTautomers(structf): + + f = structf + ".sdf" + inchi, aux = GetInchi(f) + + abc = 'abcdefghijklmnopqrstuvwxyz' + + t_inchis = GenTautInchis(inchi, aux, structf) + filenames = [] + for ds in range(0, len(t_inchis)): + Inchi2Struct(t_inchis[ds], f[:-4] + abc[ds], aux) + RestoreNumsSDF(f[:-4] + abc[ds] + '.sdf', f, aux) + filenames.append(f[:-4] + abc[ds]) + return len(filenames), filenames + + +#For now only works on one tautomeric system - for nucleosides don't need more +def GenTautInchis(inchi, aux, structf): + + resinchis = [] # Inchis of all tautomers, including the parent structure + + #Get tautomeric protons and atoms they are connected to + TautProts = GetTautProtons(inchi) + + #The total number of tautomeric protons in the system + if str(TautProts[0][0][1:]) != '': + totprotons = int(str(TautProts[0][0][1:])) + else: + totprotons = 1 + + #Check for and remove non-hetero atoms + temp = [int(x) for x in TautProts[0][1:] if IsHetero(int(x), inchi)] + + #Get the numbering map, to see which atomes are the tautomeric ones in the + #original structure file. 
From there determine their type and valency + #based on connectivity + amap = GetInchiRenumMap(aux) + OldTautProts = [amap[prot-1] for prot in temp] + valencies = GetTautValency(structf, OldTautProts) + + #the multivalent atoms will always have at least one proton + superfixed = [] + for i in range(0, len(valencies)): + if valencies[i] > 1: + superfixed.append(TautProts[0][i+1]) + #TautProts.append(['H', TautProts[0][i+1]]) + + #Generate all the possible proton positions + #with repetitions for multivalency + fixedprotons = list(itertools.combinations(TautProts[0][1:], + r=totprotons-len(superfixed))) + + for i in range(0, len(fixedprotons)): + fixedprotons[i] = superfixed + list(fixedprotons[i]) + fixedprotons[i].sort(key=int) + + #Count the occurences of protons positions, save the counts and + #remove redundant positions + counts = [] + for i in range(0, len(fixedprotons)): + counts.append([]) + j = 0 + while j < len(fixedprotons[i]): + counts[i].append(fixedprotons[i].count(fixedprotons[i][j])) + if counts[i][-1] > 1: + for _ in range(0, counts[i][-1]-1): + fixedprotons[i].remove(fixedprotons[i][j]) + #j+=counts[i][-1] + j += 1 + + fixprefix = '/f/h' + tauts = [] + for i in range(0, len(fixedprotons)): + tauts.append(fixprefix) + for j in range(0, len(fixedprotons[i])): + if j > 0: + tauts[i] += ',' + tauts[i] += fixedprotons[i][j] + 'H' + if counts[i][j] > 1: + tauts[i] += str(counts[i][j]) + + for taut in tauts: + resinchis.append(inchi + taut) + + return resinchis + + +def GetTautValency(structf, tautatoms): + #Read molecule from file + obconversion = OBConversion() + obconversion.SetInFormat("sdf") + obmol = OBMol() + obconversion.ReadFile(obmol, structf + ".sdf") + + protvalency = [] + for idx in tautatoms: + atom = obmol.GetAtom(idx) + corenum = atom.GetAtomicNum() + #If atom is hydrogen, check what it is connected to + nheavynbr = 0 + for NbrAtom in OBAtomAtomIter(atom): + nbrnum = NbrAtom.GetAtomicNum() + if nbrnum > 1: + nheavynbr += 1 + if corenum == 8: + protvalency.append(1) + if corenum == 7: + protvalency.append(3-nheavynbr) + + return protvalency + + +#utility function that determines if an atom is a heteroatom +def IsHetero(n, inchi): + layers = inchi.split('/') + nC = int((layers[1].split('H'))[0][1:]) + if (n > nC): + return True + else: + return False + + +def GenSelectDiastereomers(structf, atoms): + + f = structf + ".sdf" + inchi, aux = GetInchi(f) + amap = GetInchiRenumMap(aux) + + translated_atoms = [] + for atom in atoms: + translated_atoms.append(amap.index(atom)+1) + + ds_inchis = GenSelectDSInchis(inchi, translated_atoms) + ds_inchis = [FixTautProtons(f, i, aux) for i in ds_inchis] + filenames = [] + for ds in range(0, len(ds_inchis)): + Inchi2Struct(ds_inchis[ds], f[:-4] + str(ds+1), aux) + RestoreNumsSDF(f[:-4] + str(ds+1) + '.sdf', f, aux) + filenames.append(f[:-4] + str(ds+1)) + return len(filenames), filenames + + +def GenSelectDSInchis(inchi, atoms): + #Inchis of all diastereomers, including the parent structure + resinchis = [] + + #get the number of potential diastereomers + layers = inchi.split('/') + for l in layers: + if 't' in l: + slayer = l + sc = l[1:].split(',') + + ignore = [] + for i in range(0, len(sc)): + if not int(sc[i][:-1]) in atoms: + ignore.append(sc[i]) + sc = [x for x in sc if x not in ignore] + + if len(sc) == 0: + "No stereocentres remaining, no diastereomers will be generated." 
+ return 0 + + numds = 2**(len(sc)) + print "Number of diastereomers to be generated: " + str(numds) + temps = [] + #Generate inversion patterns - essentially just binary strings + for i in range(0, numds): + template = bin(i)[2:].zfill(len(sc)) + temps.append(template) + + #For each 1 in template, invert the corresponding stereocentre + #and add the resulting diastereomer to the list + invert = {'+': '-', '-': '+'} + + reslayers = [] + for ds in range(0, numds): + newds = list(sc) + for stereocentre in range(0, len(sc)): + if temps[ds][stereocentre] == '1': + tlist = list(newds[stereocentre]) + tlist[-1] = invert[tlist[-1]] + newds[stereocentre] = "".join(tlist) + newlayer = str(slayer) + for stereocentre in range(0, len(sc)): + newlayer = newlayer.replace(sc[stereocentre], newds[stereocentre]) + reslayers.append(newlayer) + print reslayers + resinchis = [] + for layer in reslayers: + resinchis.append(inchi.replace(slayer, layer)) + return resinchis + + +def GenDiastereomers(structf): + + f = structf + ".sdf" + inchi, aux = GetInchi(f) + + print inchi + + ds_inchis = GenDSInchis(inchi) + ds_inchis = [FixTautProtons(f, i, aux) for i in ds_inchis] + filenames = [] + for ds in range(0, len(ds_inchis)): + Inchi2Struct(ds_inchis[ds], f[:-4] + str(ds+1), aux) + RestoreNumsSDF(f[:-4] + str(ds+1) + '.sdf', f, aux) + filenames.append(f[:-4] + str(ds+1)) + return len(filenames), filenames + + +def GenDSInchis(inchi): + + ilist = list(inchi) + #Inchis of all diastereomers, including the parent structure + resinchis = [] + + #get the number of potential diastereomers + layers = inchi.split('/') + for l in layers: + if 't' in l: + numds = 2**(len(l.translate(None, 't,1234567890'))-1) + + print "Number of diastereomers to be generated: " + str(numds) + + #find configuration sites (+ and -) + bs = ilist.index('t') + es = ilist[bs:].index('/') + spos = [] + for s in range(bs, bs+es): + if ilist[s] == '+' or ilist[s] == '-': + spos.append(s) + + temps = [] + #Generate inversion patterns - essentially just binary strings + for i in range(0, numds): + template = bin(i)[2:].zfill(len(spos)-1) + temps.append(template) + + #For each 1 in template, invert the corresponding stereocentre + #and add the resulting diastereomer to the list + invert = {'+': '-', '-': '+'} + + for ds in range(0, numds): + t = list(ilist) + for stereocentre in range(1, len(spos)): + if temps[ds][stereocentre-1] == '1': + t[spos[stereocentre]] = invert[t[spos[stereocentre]]] + resinchis.append(''.join(t)) + + return resinchis diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a83d17d --- /dev/null +++ b/LICENSE @@ -0,0 +1,23 @@ +PyDP4 integrated workflow for the running of MM, DFT GIAO calculations and +DP4 analysis +v0.4 + +Copyright (c) 2015 Kristaps Ermanis, Jonathan M. Goodman + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/MacroModel.py b/MacroModel.py new file mode 100644 index 0000000..2fd9981 --- /dev/null +++ b/MacroModel.py @@ -0,0 +1,262 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Jun 12 13:15:47 2015 + +@author: ke291 + +Contains all of the MacroModel specific code for input generation, calculation +execution and output interpretation. Called by PyDP4.py. +""" + +import os +import sys +import subprocess +import shutil +import time + + +def SetupMacromodel(numDS, settings, *args): + + for f in args: + if settings.Rot5Cycle is True: + if not os.path.exists(f+'rot.sdf'): + import FiveConf + #Generate the flipped fivemembered ring, + #result is in '*rot.sdf' file + FiveConf.main(f + '.sdf', settings) + + scriptdir = getScriptPath() + cwd = os.getcwd() + + #Convert input structure to mae file + convinp = settings.SCHRODINGER + '/utilities/sdconvert -isd ' + outp = subprocess.check_output(convinp + f + '.sdf -omae ' + f + + '.mae', shell=True) + + #Copy default com file to directory + shutil.copyfile(scriptdir + '/default.com', cwd + '/' + f + '.com') + #Change input and output file names in com file + comf = open(f + '.com', 'r+') + com = comf.readlines() + com[0] = f + '.mae\n' + com[1] = f + '-out.mae\n' + + #Change the molecular mechanics step count in the com file + cycles = (str(settings.MMstepcount)).rjust(6) + temp = list(com[7]) + temp[7:13] = list(cycles) + com[7] = "".join(temp) + comf.truncate(0) + comf.seek(0) + comf.writelines(com) + + #Change the forcefield in the com file + if (settings.ForceField).lower() == "opls": + temp = list(com[3]) + + temp[11:13] = list('14') + com[3] = "".join(temp) + comf.truncate(0) + + comf.seek(0) + comf.writelines(com) + + comf.close() + + if settings.Rot5Cycle is True: + #Convert input structure to mae file + convinp = settings.SCHRODINGER + '/utilities/sdconvert -isd ' + outp = subprocess.check_output(convinp + f + 'rot.sdf -omae ' + f + + 'rot.mae', shell=True) + + #Copy default com file to directory + shutil.copyfile(scriptdir + '/default.com', cwd + '/' + f + + 'rot.com') + #Change input and output file names in com file + comf = open(f + 'rot.com', 'r+') + com = comf.readlines() + com[0] = f + 'rot.mae\n' + com[1] = f + 'rot-out.mae\n' + + #Change the molecular mechanics step count in the com file + cycles = (str(settings.MMstepcount)).rjust(6) + temp = list(com[7]) + temp[7:13] = list(cycles) + com[7] = "".join(temp) + comf.truncate(0) + comf.seek(0) + comf.writelines(com) + comf.close() + print "Macromodel input for " + f + " prepared." 
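#Illustrative sketch only, not part of the original file: SetFixedField is a
#hypothetical helper showing how the column edits in SetupMacromodel above
#work - the MacroModel .com file uses fixed-width fields, so a value is
#right-justified into its column range rather than appended. The code above
#does this for the step count (columns 7:13 of line 8) and the OPLS force
#field switch (columns 11:13 of line 4).
def SetFixedField(line, start, end, value):
    chars = list(line)
    chars[start:end] = list(str(value).rjust(end - start))
    return ''.join(chars)

#e.g. com[7] = SetFixedField(com[7], 7, 13, settings.MMstepcount)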
+ + +def RunMacromodel(numDS, settings, *args): + #Run Macromodel conformation search for all diastereomeric inputs + NCompleted = 0 + + for ds in args: + if not os.path.exists(ds+'.log'): + print settings.SCHRODINGER + '/bmin ' + ds + outp = subprocess.check_output(settings.SCHRODINGER + '/bmin ' + + ds, shell=True) + else: + print ds + ".log exists, skipping" + continue + + time.sleep(60) + while(not IsMMCompleted(ds + '.log')): + time.sleep(30) + NCompleted = NCompleted + 1 + + if settings.Rot5Cycle is True: + print "Macromodel job " + str(NCompleted) + " of " + str(numDS*2)\ + + " completed." + + print settings.SCHRODINGER + '/bmin ' + ds + 'rot' + outp = subprocess.check_output(settings.SCHRODINGER + '/bmin ' + + ds+'rot', shell=True) + time.sleep(60) + while(not IsMMCompleted(ds + 'rot.log')): + time.sleep(30) + NCompleted = NCompleted + 1 + print "Macromodel job " + str(NCompleted) + " of " + str(numDS*2)\ + + " completed." + else: + print "Macromodel job " + str(NCompleted) + " of " + str(numDS) +\ + " completed." + + +def ReadMacromodel(MMoutp, settings): + + conformers = [] + conformer = -1 + AbsEs = [] + + atoms = [] + charge = 0 + MaeInps = [] + + MaeFile = file(MMoutp + '-out.mae', 'r') + MaeInps.append(MaeFile.readlines()) + MaeFile.close() + + if settings.Rot5Cycle is True: + MaeFile = file(MMoutp + 'rot-out.mae', 'r') + MaeInps.append(MaeFile.readlines()) + MaeFile.close() + + for MaeInp in MaeInps: + index = 0 + AbsEOffsets = [] + #find conformer description blocks + blocks = [] + for i in range(0, len(MaeInp)): + if 'f_m_ct' in MaeInp[i]: + blocks.append(i) + if 'p_m_ct' in MaeInp[i]: + blocks.append(i) + + #find absolute energy offsets + for block in blocks: + for i in range(block, len(MaeInp)): + if 'mmod_Potential_Energy' in MaeInp[i]: + AbsEOffsets.append(i-block) + break + + #Get absolute energies for conformers + for i in range(0, len(blocks)): + for line in range(blocks[i], len(MaeInp)): + if ':::' in MaeInp[line]: + AbsEs.append(float(MaeInp[line+AbsEOffsets[i]])) + break + + #find geometry descriptions for each block + for i in range(0, len(blocks)): + for line in (MaeInp[blocks[i]:]): + if 'm_atom' in line: + blocks[i] = blocks[i] + MaeInp[blocks[i]:].index(line) + break + + #find start of atom coordinates for each block + for i in range(0, len(blocks)): + for line in (MaeInp[blocks[i]:]): + if ':::' in line: + blocks[i] = blocks[i] + MaeInp[blocks[i]:].index(line) + break + + #Read the atom numbers and coordinates + for block in blocks: + conformers.append([]) + conformer = conformer + 1 + index = block+1 + atom = 0 + while not ':::' in MaeInp[index]: + line = MaeInp[index].split(' ') + line = [word for word in line if word != ''] + conformers[conformer].append([]) + if conformer == 0: + atoms.append(GetMacromodelSymbol(int(line[1]))) + conformers[0][atom].append(line[0]) # add atom number + conformers[0][atom].append(line[2]) # add X + conformers[0][atom].append(line[3]) # add Y + conformers[0][atom].append(line[4]) # add Z + charge = charge + int(line[21]) + else: + if blocks.index(block) == 0: + conformers[conformer][atom].append(line[0]) # add atom number + conformers[conformer][atom].append(line[2]) # add X + conformers[conformer][atom].append(line[3]) # add Y + conformers[conformer][atom].append(line[4]) # add Z + else: + conformers[conformer][atom].append(line[0]) # add atom number + conformers[conformer][atom].append(line[1]) # add X + conformers[conformer][atom].append(line[2]) # add Y + conformers[conformer][atom].append(line[3]) # add Z + + index = 
index + 1 # Move to next line + atom = atom + 1 # Move to next atom + #Pick only the conformers in the energy window + MinE = min(AbsEs) + + conformers2 = [] + for i in range(0, len(conformers)): + if AbsEs[i] < MinE+settings.MaxCutoffEnergy: + conformers2.append(conformers[i]) + + return atoms, conformers2, charge + + +def GetMacromodelSymbol(atomType): + + Lookup = ['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', + 'C', 'C', 'C', 'O', 'O', 'O', ' ', 'O', ' ', 'O', + 'O', ' ', 'O', 'N', 'N', 'N', ' ', ' ', ' ', ' ', + 'N', 'N', ' ', ' ', ' ', ' ', ' ', 'N', 'N', 'N', + 'H', 'H', 'H', 'H', 'H', ' ', ' ', 'H', 'S', ' ', + 'S', 'S', 'P', 'B', 'B', 'F', 'Cl', 'Br', 'I', 'Si', + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', + ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', 'S', + 'S', 'Cl', 'B', 'F', ' ', ' ', ' ', ' ', 'S', 'S', + ' ', ' ', 'S', 'S'] + + if Lookup[atomType-1] == ' ': + print 'Unknown atom type' + + return Lookup[atomType-1] + + +def getScriptPath(): + return os.path.dirname(os.path.realpath(sys.argv[0])) + + +def IsMMCompleted(f): + Gfile = open(f, 'r') + outp = Gfile.readlines() + Gfile.close() + + if "normal termination" in outp[-3]: + return True + else: + return False diff --git a/NMRDP4GTF.py b/NMRDP4GTF.py new file mode 100644 index 0000000..b8df8af --- /dev/null +++ b/NMRDP4GTF.py @@ -0,0 +1,469 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Jan 12 14:42:47 2015 + +@author: ke291 + +Takes care of all the NMR description interpretation, equivalent atom +averaging, Boltzmann averaging, tautomer population optimisation (if used) +and DP4 input preparation and running either DP4.jar or DP4.py. Called by +PyDP4.py +""" + +import Gaussian +import NWChem + +import subprocess +import sys +import scipy.optimize as sciopt +import math +import os + +TMS_SC_C13 = 191.69255 +TMS_SC_H1 = 31.7518583 + + +def main(numDS, settings, *args): + + #This function runs nmrPredict for each diastereomer and collects + #the outputs + print '\nRunning NMRpredict script...' 
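    # Both branches below collect Boltzmann-averaged isotropic shieldings and
    # conformer populations for every candidate structure. Further down, each
    # shielding (sigma) is converted to a chemical shift by referencing to TMS:
    #     delta = (sigma_TMS - sigma) / (1 - sigma_TMS * 1e-6)
    # As an illustrative number only: a carbon shielding of 150.0 with
    # TMS_SC_C13 = 191.69255 gives delta of about 41.70 ppm.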
+ + if settings.DFT == 'z' or settings.DFT == 'g': + (outputs, NumFiles, Ntaut) = Gaussian.RunNMRPredict(numDS, *args[:-1]) + Noutp = len(outputs) + elif settings.DFT == 'n' or settings.DFT == 'w': + (RelEs, populations, labels, BoltzmannShieldings, Ntaut) = \ + NWChem.RunNMRPredict(numDS, *args) + Noutp = len(BoltzmannShieldings) + + #Reads the experimental NMR data from the file + ExpNMR = args[-1] + ExpNMR_file = open(ExpNMR, 'r') + Cexp = ExpNMR_file.readline() + ExpNMR_file.readline() + Hexp = ExpNMR_file.readline() + + expHlabels = [] + expHvalues = [] + + #Loops through the experimental proton NMR data and picks out values + # and atom numbers and places them in expHlabels, expHvalues, expClabels, + #expCvalues + + expbuf = Hexp.split(',') + + for i in range(0, len(expbuf)): + shiftend = expbuf[i].find('(') + expHvalues.append(float(expbuf[i][0:shiftend])) + labelend = expbuf[i].find(')') + expHlabels.append(expbuf[i][shiftend+1:labelend]) + + #Check if exp NMR file contains info about equivalent atoms and read it + #into an array + #Also reads a list of atoms to omit from analysis + + equivalents = [] + omits = [] + + ExpNMR_file.readline() + for line in ExpNMR_file: + if not 'OMIT' in line and len(line) > 1: + equivalents.append(line[:-1].split(',')) + else: + omits.append(line[5:-1].split(',')) + ExpNMR_file.close() + + Clabels = [] + Hlabels = [] + Cvalues = [] + Hvalues = [] + + #This loops through predictNMR outputs for each diastereomer and collects + #NMR data + + flatequiv = [val for sublist in equivalents for val in sublist] + flatomits = [val for sublist in omits for val in sublist] + + if settings.DFT == 'z' or settings.DFT == 'g': + for DS in range(0, len(outputs)): + + Cvalues.append([]) + Hvalues.append([]) + + nmrdata = outputs[DS].split('\n') + + RelEnergies = nmrdata[2].split(',') + buf = '' + for val in RelEnergies[1:NumFiles[DS]]: + num = float(val) + buf = buf + "{:5.2f}".format(num) + ', ' + buf = buf + "{:5.2f}".format(float(RelEnergies[NumFiles[DS]])) + print '\nConformer relative energies (kJ/mol): ' + buf + + #Print population(%) for diagnostic purposes + Populations = nmrdata[3].split(',') + buf = '' + for val in Populations[1:NumFiles[DS]]: + num = float(val) + buf = buf + "{:4.1f}".format(num) + ', ' + buf = buf + "{:4.1f}".format(float(Populations[NumFiles[DS]])) + print '\nPopulations (%): ' + buf + + #loops through particular output and collects shielding constants + #and calculates shifts relative to TMS + for row in nmrdata[4:]: + data = row.split(',') + shift = 0 + if len(data) > NumFiles[DS]: + if data[0][23] == 'C' and not data[0][23:] in flatomits: + # only read labels once, i.e. the first diastereomer + if DS == 0: + Clabels.append(str(data[0][23:])) + shift = (TMS_SC_C13-float(data[NumFiles[DS]+1])) / \ + (1-(TMS_SC_C13/10**6)) + Cvalues[DS].append(shift) + + if data[0][23] == 'H' and not data[0][23:] in flatomits: + # only read labels once, i.e. 
the first diastereomer + if DS == 0: + Hlabels.append(str(data[0][23:])) + shift = (TMS_SC_H1-float(data[NumFiles[DS]+1])) / \ + (1-(TMS_SC_H1/10**6)) + Hvalues[DS].append(shift) + + elif settings.DFT == 'n' or settings.DFT == 'w': + for DS in range(0, numDS): + + Cvalues.append([]) + Hvalues.append([]) + + buf = '' + for val in RelEs[DS]: + num = float(val) + buf = buf + "{:5.2f}".format(num) + ', ' + print '\nConformer relative energies (kJ/mol): ' + buf[:-2] + + buf = '' + for val in populations[DS]: + num = float(val) + buf = buf + "{:4.1f}".format(num*100) + ', ' + print '\nPopulations (%): ' + buf[:-2] + + #loops through particular output and collects shielding constants + #and calculates shifts relative to TMS + for atom in range(0, len(BoltzmannShieldings[DS])): + shift = 0 + if labels[atom][0] == 'C' and not labels[atom] in flatomits: + # only read labels once, i.e. the first diastereomer + if DS == 0: + Clabels.append(labels[atom]) + shift = (TMS_SC_C13-BoltzmannShieldings[DS][atom]) / \ + (1-(TMS_SC_C13/10**6)) + Cvalues[DS].append(shift) + + if labels[atom][0] == 'H' and not labels[atom] in flatomits: + # only read labels once, i.e. the first diastereomer + if DS == 0: + Hlabels.append(labels[atom]) + shift = (TMS_SC_H1-BoltzmannShieldings[DS][atom]) / \ + (1-(TMS_SC_H1/10**6)) + Hvalues[DS].append(shift) + + #Looks for equivalent atoms in the computational data, averages the shifts + #and removes the redundant signals + for eqAtoms in equivalents: + + eqSums = [0.0]*Noutp + eqAvgs = [0.0]*Noutp + + if eqAtoms[0][0] == 'H': + #print eqAtoms, Hlabels + for atom in eqAtoms: + eqIndex = Hlabels.index(atom) + for ds in range(0, Noutp): + eqSums[ds] = eqSums[ds] + Hvalues[ds][eqIndex] + for ds in range(0, Noutp): + eqAvgs[ds] = eqSums[ds]/len(eqAtoms) + + #Place the new average value in the first atom shifts place + target_index = Hlabels.index(eqAtoms[0]) + for ds in range(0, Noutp): + Hvalues[ds][target_index] = eqAvgs[ds] + + #Delete the redundant atoms from the computed list + #start with second atom - e.g. don't delete the original one + for atom in range(1, len(eqAtoms)): + del_index = Hlabels.index(eqAtoms[atom]) + del Hlabels[del_index] + for ds in range(0, Noutp): + del Hvalues[ds][del_index] + + if eqAtoms[0][0] == 'C': + for atom in eqAtoms: + eqIndex = Clabels.index(atom) + for ds in range(0, Noutp): + eqSums[ds] = eqSums[ds] + Cvalues[ds][eqIndex] + for ds in range(0, Noutp): + eqAvgs[ds] = eqSums[ds]/len(eqAtoms) + + #Place the new average value in the first atom shifts place + target_index = Clabels.index(eqAtoms[0]) + for ds in range(0, Noutp): + Cvalues[ds][target_index] = eqAvgs[ds] + + #Delete the redundant atoms from the computed list + #start with second atom - e.g. don't delete the original one + for atom in range(1, len(eqAtoms)): + del_index = Clabels.index(eqAtoms[atom]) + del Clabels[del_index] + for ds in range(0, Noutp): + del Cvalues[ds][del_index] + + tstart = 0 + OptCvalues = [] + OptHvalues = [] + + for tindex in range(0, len(Ntaut)): + print 'looking at tautomers ' + str(tstart) + ' to ' + \ + str(tstart+Ntaut[tindex]) + if Ntaut[tindex] == 1: + print "Only one tautomer found, skipping optimisation." 
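            # A single tautomer needs no population fitting, so its computed
            # shifts are passed through unchanged; otherwise OptTautPop below
            # fits the tautomer weights against the experimental shifts.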
+ OptCvalues.append(Cvalues[tstart]) + OptHvalues.append(Hvalues[tstart]) + tstart = tstart + Ntaut[tindex] + else: + (BuffC, BuffH) = OptTautPop(Clabels, + Cvalues[tstart:tstart+Ntaut[tindex]], + Hlabels, + Hvalues[tstart:tstart+Ntaut[tindex]], + Cexp, Hexp) + OptCvalues.append(BuffC) + OptHvalues.append(BuffH) + tstart = tstart + Ntaut[tindex] + + #Output the seperated shifts to terminal and DP4 input file + #along with the experimental NMR data + if (not settings.PDP4) and (not settings.EP5): + + WriteDP4input(Clabels, OptCvalues, Cexp, Hlabels, OptHvalues, Hexp) + #Run the DP4 java file and collect the output + javafolder = getScriptPath() + DP4outp = subprocess.check_output('CLASSPATH=' + javafolder + + ' java -jar ' + javafolder + '/DP4.jar DP4inp.inp', + shell=True) + print '\n' + DP4outp + + else: + import DP4 + DP4outp = DP4.main(Clabels, OptCvalues, Hlabels, OptHvalues, Cexp, + Hexp, settings) + + return '\n'.join(DP4outp) + '\n' + + +def WriteDP4input(Clabels, Cvalues, Cexp, Hlabels, Hvalues, Hexp): + + print '\nWriting input file for DP4...' + DP4_file = open('DP4inp.inp', 'w') + + DP4_file.write(','.join(Clabels) + '\n') + print '\n' + ','.join(Clabels) + for line in Cvalues: + print ','.join(format(v, "4.2f") for v in line) + DP4_file.write(','.join(format(v, "4.2f") for v in line) + '\n') + + DP4_file.write('\n' + Cexp) + print '\n' + Cexp + + DP4_file.write('\n' + ','.join(Hlabels) + '\n') + print '\n' + ','.join(Hlabels) + for line in Hvalues: + print ','.join(format(v, "4.2f") for v in line) + DP4_file.write(','.join(format(v, "4.2f") for v in line) + '\n') + + DP4_file.write('\n' + Hexp + '\n') + print '\n' + Hexp + + DP4_file.close() + + +def OptTautPop(Clabels, Cvalues, Hlabels, Hvalues, Cexp, Hexp): + #Pairwise match exp signals to computed ones based on assignments first, + #on erorrs afterwards + ExpCvalues = [-1 for i in range(0, len(Clabels))] + ExpHvalues = [-1 for i in range(0, len(Hlabels))] + + Hdata = Hexp.split(',') + for s in range(0, len(Hdata)): + Hdata[s] = Hdata[s].strip() + + Cdata = Cexp.split(',') + for s in range(0, len(Cdata)): + Cdata[s] = Cdata[s].strip() + + UAExpCshifts = list(Cdata) + UAExpHshifts = list(Hdata) + + #Assign known(experimentally assigned) signals first + for l in range(0, len(Clabels)): + for s in Cdata: + if (Clabels[l] + ')') in s and (not 'or' in s) and (not 'OR' in s): + shiftend = s.find('(') + ExpCvalues[l] = float(s[:shiftend]) + UAExpCshifts.remove(s) + break + for l in range(0, len(Hlabels)): + for s in Hdata: + if (Hlabels[l] + ')') in s and (not 'or' in s) and (not 'OR' in s): + shiftend = s.find('(') + ExpHvalues[l] = float(s[:shiftend]) + UAExpHshifts.remove(s) + break + + #Prepare unassigned experimental values for matching + for i in range(0, len(UAExpHshifts)): + shiftend = UAExpHshifts[i].find('(') + UAExpHshifts[i] = float(UAExpHshifts[i][:shiftend]) + + for i in range(0, len(UAExpCshifts)): + shiftend = UAExpCshifts[i].find('(') + UAExpCshifts[i] = float(UAExpCshifts[i][:shiftend]) + + #Try to assign unassigned values based on every calculated tautomer + MinMAE = 1000 + for k in range(0, len(Cvalues)): + #Pick out unassigned computational values for matching + UACompCshifts = [] + UACompHshifts = [] + for i in range(0, len(ExpCvalues)): + if ExpCvalues[i] == -1: + UACompCshifts.append(Cvalues[k][i]) + for i in range(0, len(ExpHvalues)): + if ExpHvalues[i] == -1: + UACompHshifts.append(Hvalues[k][i]) + + #Sort both sets of data - this essentially pairs them up + UAExpCshifts.sort() + UAExpHshifts.sort() + 
UACompCshifts.sort() + UACompHshifts.sort() + + #Go through half-assigned experimental data and fill in the holes with + #paired data + for i in range(0, len(ExpCvalues)): + if ExpCvalues[i] == -1 and len(UAExpCshifts) > 0: + j = UACompCshifts.index(Cvalues[k][i]) + ExpCvalues[i] = UAExpCshifts[j] + for i in range(0, len(ExpHvalues)): + if ExpHvalues[i] == -1 and len(UAExpHshifts) > 0: + j = UACompHshifts.index(Hvalues[k][i]) + ExpHvalues[i] = UAExpHshifts[j] + + #Optimize tautomer populations for + #tpops is 1 shorter than the number of tautomers, + #the remaining weight is 1-sum(rest) + tpops = [1.0/len(Cvalues) for i in range(len(Cvalues)-1)] + f = lambda w: TautError(Cvalues, ExpCvalues, Hvalues, ExpHvalues, w) + res = sciopt.minimize(f, tpops, method='nelder-mead') + if float(res.fun) < MinMAE: + print "New min MAE: " + str(res.fun) + MinMAE = float(res.fun) + NewPops = list(res.x) + NewPops.append(1-sum(NewPops)) + print NewPops + + NewCvalues = [] + NewHvalues = [] + + #calculate the new C values + for atom in range(0, len(Clabels)): + C = 0 + for taut in range(0, len(Cvalues)): + C = C + Cvalues[taut][atom]*NewPops[taut] + NewCvalues.append(C) + + for atom in range(0, len(Hlabels)): + H = 0 + for taut in range(0, len(Hvalues)): + H = H + Hvalues[taut][atom]*NewPops[taut] + NewHvalues.append(H) + + #Return the new Cvalues and Hvalues + return (NewCvalues, NewHvalues) + + +def TautError(Cs, CExp, Hs, HExp, TPopsIn): + + if len(Cs) != len(TPopsIn)+1 or len(Hs) != len(TPopsIn)+1: + print len(Cs), len(Hs), len(TPopsIn) + print ("Input dimensions in TautError don't match, exiting...") + return 1000 + TPops = list(TPopsIn) + TPops.append(1-sum(TPops)) + SumC = [] + SumH = [] + #print len(Cs), len(Hs), len(TPops) + #print TPops + for i in range(0, len(TPops)): + if TPops[i] < 0: + return 100 + if sum(TPops) > 1: + s = sum(TPops) + for i in range(0, len(TPops)): + TPops[i] = TPops[i]/s + + for atom in range(0, len(CExp)): + C = 0 + for taut in range(0, len(Cs)): + C = C + Cs[taut][atom]*TPops[taut] + SumC.append(C) + + for atom in range(0, len(HExp)): + H = 0 + for taut in range(0, len(Hs)): + H = H + Hs[taut][atom]*TPops[taut] + SumH.append(H) + + ErrC = MAE(SumC, CExp) + #ErrC = RMSE(SumC, CExp) + ErrH = MAE(SumH, HExp) + #ErrH = RMSE(SumH, HExp) + #print 'MAE for C: ' + str(ErrC) + #print 'MAE for H: ' + str(ErrH) + + return ErrC + 20*ErrH + + +def getScriptPath(): + return os.path.dirname(os.path.realpath(sys.argv[0])) + + +def MAE(L1, L2): + + if len(L1) != len(L2): + return -1 + else: + L = [] + for i in range(0, len(L1)): + L.append(abs(L1[i]-L2[i])) + return sum(L)/len(L) + + +def RMSE(L1, L2): + + if len(L1) != len(L2): + return -1 + else: + L = [] + for i in range(0, len(L1)): + L.append((L1[i]-L2[i])**2) + return math.sqrt(sum(L)/len(L)) + +if __name__ == '__main__': + #print sys.argv + cpargs = sys.argv[2:] + numDS = int(sys.argv[1]) + for ds in range(0, numDS): + cpargs[ds*2+1] = int(cpargs[ds*2+1]) + main(numDS, *cpargs) diff --git a/NWChem.py b/NWChem.py new file mode 100644 index 0000000..98e8e27 --- /dev/null +++ b/NWChem.py @@ -0,0 +1,308 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Nov 19 15:56:54 2014 + +@author: ke291 + +Contains all of the NWChem specific code for input generation and calculation +execution. Called by PyDP4.py. 
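A .nw input deck written by WriteNWChemFile below looks roughly like the
following (abridged sketch assembled from that function's write calls; the
title, charge and coordinates are illustrative, and the scratch_dir/cosmo
lines are only added for the relevant settings):

    memory stack 1500 mb heap 1500 mb global 3000 mb
    echo

    start molecule

    title "Candidate1nwinp001"
    charge 0

    geometry units angstroms print xyz autosym
     C 1.234 0.567 -0.890
     ...
    end

    basis
     * library 6-31G**
    end

    dft
     xc b3lyp
     mult 1
    end

    task dft energy

    property
     shielding
    end
    task dft property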
+""" + +import Tinker +import MacroModel +import nmrPredictNWChem + +import pyximport +pyximport.install() +import ConfPrune +import glob +import os +import subprocess +import time +import socket + + +def SetupNWChem(MMoutp, NWCheminp, numDigits, settings, adjRMSDcutoff): + + #Reads conformer geometry, energies and atom labels from Tinker output + #(atoms, conformers) = ReadConformers(MMoutp, MaxEnergy) + + if settings.MMTinker: + #Reads conformer geometry, energies and atom labels from Tinker output + (atoms, conformers, charge) = Tinker.ReadTinker(MMoutp, settings) + else: + (atoms, conformers, charge) = MacroModel.ReadMacromodel(MMoutp, + settings) + + #Prune similar conformations, if the number exceeds the limit + if len(conformers) > settings.PerStructConfLimit: + pruned = ConfPrune.RMSDPrune(conformers, atoms, adjRMSDcutoff) + else: + pruned = conformers + + print str(len(conformers) - len(pruned)) +\ + " or " + "{:.1f}".format(100 * (len(conformers) - len(pruned)) / + len(conformers)) + "% of conformations have been pruned based on " + \ + str(adjRMSDcutoff) + " angstrom cutoff" + + for num in range(0, len(pruned)): + filename = NWCheminp+str(num+1).zfill(3) + WriteNWChemFile(filename, pruned[num], atoms, charge, settings) + + print str(len(pruned)) + " .nw files written" + + +#Adjust the RMSD cutoff to keep the conformation numbers reasonable +def AdaptiveRMSD(MMoutp, settings): + + if settings.MMTinker: + #Reads conformer geometry, energies and atom labels from Tinker output + (atoms, conformers, charge) = Tinker.ReadTinker(MMoutp, settings) + else: + (atoms, conformers, charge) = MacroModel.ReadMacromodel(MMoutp, + settings) + + return ConfPrune.AdaptRMSDPrune(conformers, atoms, + settings.InitialRMSDcutoff, + settings.PerStructConfLimit) + + +def WriteNWChemFile(NWCheminp, conformer, atoms, charge, settings): + + f = file(NWCheminp + '.nw', 'w') + f.write('memory stack 1500 mb heap 1500 mb global 3000 mb\n') + if settings.DFT == 'w': + f.write('scratch_dir /scratch/' + settings.user + '/' + NWCheminp + '\n') + f.write('echo\n\nstart molecule\n\ntitle "'+NWCheminp+'"\n') + f.write('echo\n\nstart\n\n') + f.write('charge ' + str(charge) + '\n\n') + f.write('geometry units angstroms print xyz autosym\n') + + natom = 0 + for atom in conformer: + f.write(' ' + atoms[natom] + ' ' + atom[1] + ' ' + atom[2] + ' ' + + atom[3] + '\n') + natom = natom + 1 + + f.write('end\n\nbasis\n * library 6-31G**\nend\n\n') + f.write('dft\n xc b3lyp\n mult 1\nend\n\n') + if settings.Solvent != "": + f.write('cosmo\n do_cosmo_smd true\n solvent ' + settings.Solvent + '\n') + f.write('end\n\n') + f.write('task dft energy\n\n') + f.write('property\n shielding\nend\n') + f.write('task dft property\n') + f.close() + + +def GetFiles2Run(inpfiles, settings): + #Get the names of all relevant input files + NinpFiles = [] + for filename in inpfiles: + NinpFiles = NinpFiles + glob.glob(filename + 'nwinp???.nw') + + Files2Run = [] + + #for every input file check that there is a completed output file, + #delete the incomplete outputs and add the inputs to be done to Files2Run + for filename in NinpFiles: + if not os.path.exists(filename[:-3]+'.nwo'): + Files2Run.append(filename) + else: + if IsNWChemCompleted(filename[:-3] + '.nwo'): + continue + else: + os.remove(filename[:-3] + '.nwo') + Files2Run.append(filename) + + return Files2Run + + +def RunNWChem(inpfiles, settings): + + NCompleted = 0 + NWChemPrefix = "nwchem " + + for f in inpfiles: + print NWChemPrefix + f + ' > ' + f[:-2] + 'nwo' + outp = 
subprocess.check_output(NWChemPrefix + f + ' > ' + f[:-2] + + 'nwo', shell=True) + NCompleted += 1 + print "NWChem job " + str(NCompleted) + " of " + str(len(inpfiles)) + \ + " completed." + + +def RunNMRPredict(numDS, *args): + + NWNames = [] + NTaut = [] + + for val in range(0, numDS): + NTaut.append(args[val*2]) + NWNames.append(args[val*2+1]) + + RelEs = [] + populations = [] + BoltzmannShieldings = [] + + print NWNames + print NTaut + #This loop runs nmrPredict for each diastereomer and collects + #the outputs + for isomer in NWNames: + + NWFiles = glob.glob(isomer + 'nwinp*.nwo') + for f in range(0, len(NWFiles)): + NWFiles[f] = NWFiles[f][:-4] + + #Runs nmrPredictNWChem Name001, ... and collects output + (x, y, labels, z) = nmrPredictNWChem.main(*NWFiles) + RelEs.append(x) + populations.append(y) + BoltzmannShieldings.append(z) + + return (RelEs, populations, labels, BoltzmannShieldings, NTaut) + + +def IsNWChemCompleted(f): + Nfile = open(f, 'r') + outp = Nfile.readlines() + Nfile.close() + outp = "".join(outp) + if "AUTHORS" in outp: + return True + else: + return False + + +def RunOnZiggy(folder, queue, NWFiles, settings): + + print "ziggy NWChem job submission script\n" + + #Check that folder does not exist, create job folder on ziggy + outp = subprocess.check_output('ssh ziggy ls', shell=True) + if folder in outp: + print "Folder exists on ziggy, choose another folder name." + return + + outp = subprocess.check_output('ssh ziggy mkdir ' + folder, shell=True) + #Write the qsub scripts + for f in NWFiles: + WriteSubScript(f[:-3], queue, folder, settings) + print str(len(NWFiles)) + ' .qsub scripts generated' + + #Upload .com files and .qsub files to directory + print "Uploading files to ziggy..." + for f in NWFiles: + outp = subprocess.check_output('scp ' + f +' ziggy:~/' + folder, + shell=True) + outp = subprocess.check_output('scp ' + f[:-3] +'.qsub ziggy:~/' + + folder, shell=True) + + print str(len(NWFiles)) + ' .nw and .qsub files uploaded to ziggy' + + #Launch the calculations + for f in NWFiles: + job = '~/' + folder + '/' + f[:-3] + outp = subprocess.check_output('ssh ziggy qsub -q ' + queue + ' -o ' + + job + '.log -e ' + job + '.err -l nodes=1:ppn=1:ivybridge ' + + job + '.qsub', shell=True) + time.sleep(3) + + print str(len(NWFiles)) + ' jobs submitted to the queue on ziggy' + + outp = subprocess.check_output('ssh ziggy showq', shell=True) + if settings.user in outp: + print "Jobs are running on ziggy" + + Jobs2Complete = list(NWFiles) + n2complete = len(Jobs2Complete) + + #Check and report on the progress of calculations + while len(Jobs2Complete) > 0: + JustCompleted = [job for job in Jobs2Complete if + IsZiggyGComplete(job[:-2] + 'nwo', folder, settings)] + Jobs2Complete[:] = [job for job in Jobs2Complete if + not IsZiggyGComplete(job[:-2] + 'nwo', folder, settings)] + if n2complete != len(Jobs2Complete): + n2complete = len(Jobs2Complete) + print str(n2complete) + " remaining." + + time.sleep(60) + + #When done, copy the results back + print "\nCopying the output files back to localhost..." + print 'ssh ziggy scp /home/' + settings.user + '/' + folder + '/*.nwo ' +\ + socket.getfqdn() + ':' + os.getcwd() + outp = subprocess.check_output('ssh ziggy scp /home/' + settings.user + '/' + + folder + '/*.nwo ' + socket.getfqdn() + ':' + + os.getcwd(), shell=True) + + +def WriteSubScript(NWJob, queue, ZiggyJobFolder, settings): + + if not (os.path.exists(NWJob+'.nw')): + print "The input file " + NWJob + ".nw does not exist. Exiting..." 
+ return + + #Create the submission script + QSub = open(NWJob + ".qsub", 'w') + + #Choose the queue + QSub.write('#PBS -q ' + queue + '\n#PBS -l nodes=1:ppn=1\n#\n') + + #define input files and output files + QSub.write('file=' + NWJob + '\n\n') + QSub.write('inpfile=${file}.nw\noutfile=${file}.nwo\n') + + #define cwd and scratch folder and ask the machine + #to make it before running the job + QSub.write('HERE=/home/' + settings.user + '/' + ZiggyJobFolder + '\n') + QSub.write('SCRATCH=/sharedscratch/' + settings.user + '/' + NWJob + '\n') + QSub.write('LSCRATCH=/scratch/' + settings.user + '/' + NWJob + '\n') + QSub.write('mkdir ${SCRATCH}\n') + QSub.write('mkdir ${LSCRATCH}\n') + + #load relevant modules + QSub.write('set OMP_NUM_THREADS=1\n') + QSub.write('module load anaconda\n') + QSub.write('module load gcc/4.8.3\n') + QSub.write('module load mpi/openmpi/gnu/1.8.1\n') + QSub.write('module load nwchem\n') + + #copy the input file to scratch + QSub.write('cp ${HERE}/${inpfile} $SCRATCH\ncd $SCRATCH\n') + + #write useful info to the job output file (not the gaussian) + QSub.write('echo "Starting job $PBS_JOBID"\necho\n') + QSub.write('echo "PBS assigned me this node:"\ncat $PBS_NODEFILE\necho\n') + + QSub.write('ln -s $HERE/$outfile $SCRATCH/$outfile\n') + QSub.write('nwchem $inpfile > $outfile\n') + + #Cleanup + QSub.write('rm -rf ${SCRATCH}/\n') + QSub.write('rm -rf ${LSCRATCH}/\n') + QSub.write('qstat -f $PBS_JOBID\n') + + QSub.close() + + +def IsZiggyGComplete(f, folder, settings): + + path = '/home/' + settings.user + '/' + folder + '/' + try: + outp1 = subprocess.check_output('ssh ziggy ls ' + path, shell=True) + except subprocess.CalledProcessError, e: + print "ssh ziggy ls failed: " + str(e.output) + return False + if f in outp1: + try: + outp2 = subprocess.check_output('ssh ziggy cat ' + path + f, + shell=True) + except subprocess.CalledProcessError, e: + print "ssh ziggy cat failed: " + str(e.output) + return False + if "AUTHORS" in outp2: + return True + return False diff --git a/NucCErr.pkl b/NucCErr.pkl new file mode 100644 index 0000000..f6f629b Binary files /dev/null and b/NucCErr.pkl differ diff --git a/NucCErrOpt.pkl b/NucCErrOpt.pkl new file mode 100644 index 0000000..e763dd4 Binary files /dev/null and b/NucCErrOpt.pkl differ diff --git a/NucHErr.pkl b/NucHErr.pkl new file mode 100644 index 0000000..8b7f33f Binary files /dev/null and b/NucHErr.pkl differ diff --git a/NucHErrOpt.pkl b/NucHErrOpt.pkl new file mode 100644 index 0000000..ccda636 Binary files /dev/null and b/NucHErrOpt.pkl differ diff --git a/PyDP4.py b/PyDP4.py new file mode 100755 index 0000000..4aaeb2b --- /dev/null +++ b/PyDP4.py @@ -0,0 +1,435 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +PyDP4 integrated workflow for the running of MM, DFT GIAO calculations and +DP4 analysis +v0.4 + +Copyright (c) 2015 Kristaps Ermanis, Jonathan M. Goodman + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. + +Created on Wed Nov 19 15:26:32 2014 +Updated on Mon Jul 30 2015 + +@author: ke291 + +The main file, that should be called to start the PyDP4 workflow. +Interprets the arguments and takes care of the general workflow logic. +""" + +from __future__ import division + +import Gaussian +import NMRDP4GTF +import Tinker +import MacroModel +import NWChem + +import glob +import sys +import os +import datetime +import argparse +import math + + +#Assigning the config default values +class Settings: + MMTinker = True + MMMacromodel = False + DFT = 'z' + Rot5Cycle = False + RingAtoms = [] + GenDS = True + GenTaut = False + GenProt = False + Solvent = '' + DFTOpt = False + PDP4 = False + EP5 = False + queue = 's1' + TinkerPath = '~/tinker7/bin/scan ' + OBPath = '/home/ke291/Tools/openbabel-install/lib/python2.7/site-packages/' + SCHRODINGER = '/usr/local/shared/schrodinger/current' + ScriptDir = '' + user = 'ke291' + MMstepcount = 6000 + MMfactor = 500 # nsteps = MMfactor*degrees of freedom + HardConfLimit = 10000 + MaxConcurrentJobs = 75 + PerStructConfLimit = 100 + InitialRMSDcutoff = 0.75 + MaxCutoffEnergy = 10.0 + NTaut = 1 + LogFile = 'PyDP4.log' + AssumeDone = False + GenOnly = False + SelectedStereocentres = [] + charge = None + BasicAtoms = [] + ForceField = 'mmff' + +settings = Settings() + + +def main(filename, ExpNMR, nfiles): + + print "==========================" + print "PyDP4 script,\nintegrating Tinker/MacroModel," + print "Gaussian/NWChem and DP4\nv0.4" + print "\nCopyright (c) 2015 Kristaps Ermanis, Jonathan M. 
Goodman" + print "Distributed under MIT license" + print "==========================\n\n" + + if nfiles < settings.NTaut or nfiles % settings.NTaut != 0: + print "Invalid number of tautomers/input files - number of input files\ + must be a multiple of number of tautomers" + quit() + + #Check the number of input files, generate some if necessary + if nfiles == 1: + import InchiGen + if len(settings.SelectedStereocentres) > 0: + numDS, inpfiles = InchiGen.GenSelectDiastereomers(filename, + settings.SelectedStereocentres) + else: + numDS, inpfiles = InchiGen.GenDiastereomers(filename) + if settings.GenTaut: + newinpfiles = [] + for ds in inpfiles: + print "Generating tautomers for " + ds + settings.NTaut, files = InchiGen.GenTautomers(ds) + newinpfiles.extend(files) + inpfiles = list(newinpfiles) + if settings.GenProt: + newinpfiles = [] + for ds in inpfiles: + print "Generating protomers for " + ds + settings.NTaut, files = InchiGen.GenProtomers(ds, + settings.BasicAtoms) + newinpfiles.extend(files) + inpfiles = list(newinpfiles) + else: + inpfiles = filename + if settings.GenTaut: + numDS = nfiles + import InchiGen + newinpfiles = [] + for ds in inpfiles: + print "Generating tautomers for " + ds + settings.NTaut, files = InchiGen.GenTautomers(ds) + newinpfiles.extend(files) + inpfiles = list(newinpfiles) + else: + numDS = int(nfiles/settings.NTaut) + if (numDS == 1): + import InchiGen + for f in filename: + tdiastereomers = [] + numDS, tinpfiles = InchiGen.GenDiastereomers(f) + tdiastereomers.append(tinpfiles) + tinpfiles = zip(*tdiastereomers) + inpfiles = [] + for ds in tinpfiles: + inpfiles.extend(list(ds)) + + print inpfiles + + #Check the existence of mm output files + MMRun = False + + if settings.MMTinker: + #Check if there already are Tinker output files with the right names + tinkfiles = glob.glob('*.tout') + mminpfiles = [] + for filename in inpfiles: + if filename + '.tout' in tinkfiles and (filename + 'rot.tout' in + tinkfiles or settings.Rot5Cycle is False): + if len(mminpfiles) == 0: + MMRun = True + else: + MMRun = False + mminpfiles.append(filename) + else: + #Check if there already are Tinker output files with the right names + mmfiles = glob.glob('*.log') + mminpfiles = [] + for filename in inpfiles: + if filename + '.log' in mmfiles and (filename + 'rot.log' in + mmfiles or settings.Rot5Cycle is False): + if len(mminpfiles) == 0: + MMRun = True + else: + MMRun = False + mminpfiles.append(filename) + + if MMRun: + print 'Conformation search has already been run for these inputs.\ + \nSkipping...' + if settings.GenOnly: + print "Input files generated, quitting..." + quit() + else: + if settings.MMTinker: + print 'Some Tinker files missing.' + print '\Seting up Tinker files...' + Tinker.SetupTinker(len(inpfiles), settings, *mminpfiles) + if settings.GenOnly: + print "Input files generated, quitting..." + quit() + print '\nRunning Tinker...' + Tinker.RunTinker(len(inpfiles), settings, *mminpfiles) + else: + print 'Some Macromodel files missing.' + print '\nSetting up Macromodel files...' + MacroModel.SetupMacromodel(len(inpfiles), settings, *mminpfiles) + if settings.GenOnly: + print "Input files generated, quitting..." + quit() + print '\nRunning Macromodel...' 
+ MacroModel.RunMacromodel(len(inpfiles), settings, *mminpfiles) + + if not settings.AssumeDone: + if settings.DFT == 'z' or settings.DFT == 'g': + adjRMSDcutoff = Gaussian.AdaptiveRMSD(inpfiles[0], settings) + elif settings.DFT == 'n' or settings.DFT == 'w': + adjRMSDcutoff = NWChem.AdaptiveRMSD(inpfiles[0], settings) + print 'RMSD cutoff adjusted to ' + str(adjRMSDcutoff) + #Run NWChem setup script for every diastereomer + print '\nRunning DFT setup...' + i = 1 + for ds in inpfiles: + if settings.DFT == 'z' or settings.DFT == 'g': + print "\nGaussian setup for file " + ds + " (" + str(i) +\ + " of " + str(len(inpfiles)) + ")" + Gaussian.SetupGaussian(ds, ds + 'ginp', 3, settings, + adjRMSDcutoff) + elif settings.DFT == 'n' or 'w': + print "\nNWChem setup for file " + ds +\ + " (" + str(i) + " of " + str(len(inpfiles)) + ")" + NWChem.SetupNWChem(ds, ds + 'nwinp', 3, settings, + adjRMSDcutoff) + i += 1 + QRun = False + else: + QRun = True + + if settings.DFT == 'z' or settings.DFT == 'g': + Files2Run = Gaussian.GetFiles2Run(inpfiles, settings) + elif settings.DFT == 'n' or 'w': + Files2Run = NWChem.GetFiles2Run(inpfiles, settings) + print Files2Run + if len(Files2Run) == 0: + QRun = True + + if len(Files2Run) > settings.HardConfLimit: + print "Hard conformation count limit exceeded, DFT calculations aborted." + quit() + + if QRun: + print 'DFT has already been run for these inputs. Skipping...' + else: + if settings.DFT == 'z': + print '\nRunning Gaussian on Ziggy...' + + #Run Gaussian jobs on Ziggy cluster in folder named after date + #and time in the short 1processor job queue + #and wait until the last file is completed + now = datetime.datetime.now() + MaxCon = settings.MaxConcurrentJobs + if settings.DFTOpt: + for i in range(len(Files2Run)): + Files2Run[i] = Files2Run[i][:-5] + '.com' + if len(Files2Run) < MaxCon: + Gaussian.RunOnZiggy(now.strftime('%d%b%H%M'), settings.queue, + Files2Run, settings) + else: + print "The DFT calculations will be done in " +\ + str(math.ceil(len(Files2Run)/MaxCon)) + " batches" + i = 0 + while((i+1)*MaxCon < len(Files2Run)): + print "Starting batch nr " + str(i+1) + Gaussian.RunOnZiggy(now.strftime('%d%b%H%M')+str(i+1), + settings.queue, Files2Run[(i*MaxCon):((i+1)*MaxCon)], settings) + i += 1 + print "Starting batch nr " + str(i+1) + Gaussian.RunOnZiggy(now.strftime('%d%b%H%M')+str(i+1), + settings.queue, Files2Run[(i*MaxCon):], settings) + + elif settings.DFT == 'n': + print '\nRunning NWChem locally...' + NWChem.RunNWChem(Files2Run, settings) + elif settings.DFT == 'w': + print '\nRunning NWChem on Ziggy...' + + #Run NWChem jobs on Ziggy cluster in folder named after date + #and time in the short 1 processor job queue + #and wait until the last file is completed + now = datetime.datetime.now() + MaxCon = settings.MaxConcurrentJobs + if len(Files2Run) < MaxCon: + NWChem.RunOnZiggy(now.strftime('%d%b%H%M'), settings.queue, + Files2Run, settings) + else: + print "The DFT calculations will be done in " +\ + str(math.ceil(len(Files2Run)/MaxCon)) + " batches" + i = 0 + while((i+1)*MaxCon < len(Files2Run)): + print "Starting batch nr " + str(i+1) + NWChem.RunOnZiggy(now.strftime('%d%b%H%M')+str(i+1), + settings.queue, Files2Run[(i*MaxCon):((i+1)*MaxCon)], settings) + i += 1 + print "Starting batch nr " + str(i+1) + NWChem.RunOnZiggy(now.strftime('%d%b%H%M')+str(i+1), + settings.queue, Files2Run[(i*MaxCon):], settings) + + if (numDS < 2): + print "DP4 requires at least 2 candidate structures!" 
+ else: + allargs = [] + for i in range(numDS): + allargs.append(settings.NTaut) + allargs.extend(inpfiles[i*settings.NTaut:(i+1)*settings.NTaut]) + allargs.append(ExpNMR) + DP4outp = NMRDP4GTF.main(numDS, settings, *allargs) + print '\nWriting the DP4 output to DP4outp' + if not settings.EP5: + if nfiles == 1: + DP4_ofile = open(filename + '.dp4', 'w') + else: + DP4_ofile = open(filename[0] + '.dp4', 'w') + else: + if nfiles == 1: + DP4_ofile = open(filename + '_emp.dp4', 'w') + else: + DP4_ofile = open(filename[0] + '_emp.dp4', 'w') + DP4_ofile.write(DP4outp) + DP4_ofile.close() + print 'DP4 process completed successfully.' + + +def getScriptPath(): + return os.path.dirname(os.path.realpath(sys.argv[0])) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='PyDP4 script to setup\ + and run Tinker, Gaussian (on ziggy) and DP4') + parser.add_argument('-m', '--mm', help="Select molecular mechanics program,\ + t for tinker or m for macromodel, default is t", choices=['t', 'm'], + default='t') + parser.add_argument('-d', '--dft', help="Select DFT program, j for Jaguar,\ + g for Gaussian, n for NWChem, z for Gaussian on ziggy, w for NWChem on\ + ziggy, default is z", choices=['j', 'g', 'n', 'z', 'w'], default='z') + parser.add_argument('--StepCount', help="Specify\ + stereocentres for diastereomer generation") + parser.add_argument('StructureFiles', nargs='+', default=['-'], help= + "One or more SDF file for the structures to be verified by DP4. At least one\ + is required, if automatic diastereomer and tautomer generation is used.\ + One for each candidate structure, if automatic generation is not used") + parser.add_argument("ExpNMR", help="Experimental NMR description, assigned\ + with the atom numbers from the structure file") + parser.add_argument("-s", "--solvent", help="Specify solvent to use\ + for dft calculations") + parser.add_argument("-q", "--queue", help="Specify queue for job submission\ + on ziggy", default='s1') + parser.add_argument("-t", "--ntaut", help="Specify number of explicit\ + tautomers per diastereomer given in structure files, must be a multiple\ + of structure files", type=int, default=1) + parser.add_argument("-r", "--rot5", help="Manually generate conformers for\ + 5-memebered rings", action="store_true") + parser.add_argument('--ra', help="Specify ring atoms, for the ring to be\ + rotated, useful for molecules with several 5-membered rings") + parser.add_argument("--AssumeDFTDone", help="Assume RMSD pruning, DFT setup\ + and DFT calculations have been run already", action="store_true") + parser.add_argument("-g", "--GenOnly", help="Only generate diastereomers\ + and tinker input files, but don't run any calculations", action="store_true") + parser.add_argument('-c', '--StereoCentres', help="Specify\ + stereocentres for diastereomer generation") + parser.add_argument('-T', '--GenTautomers', help="Automatically generate\ + tautomers", action="store_true") + parser.add_argument('-o', '--DFTOpt', help="Optimize geometries at DFT\ + level before NMR prediction", action="store_true") + parser.add_argument('--pd', help="Use python port of DP4", action="store_true") + parser.add_argument('--ep5', help="Use EP5", action="store_true") + parser.add_argument('-n', '--Charge', help="Specify\ + charge of the molecule. 
Do not use when input files have different charges") + parser.add_argument('-b', '--BasicAtoms', help="Generate protonated states\ + on the specified atoms and consider as tautomers") + parser.add_argument('-f', '--ff', help="Selects force field for the \ + conformational search, implemented options 'mmff' and 'opls' (2005\ + version)", choices=['mmff', 'opls'], default='mmff') + args = parser.parse_args() + print args.StructureFiles + print args.ExpNMR + settings.NTaut = args.ntaut + settings.DFT = args.dft + settings.queue = args.queue + settings.ScriptDir = getScriptPath() + settings.ForceField = args.ff + + if args.pd: + settings.PDP4 = True + settings.EP5 = False + else: + settings.PDP4 = False + + if args.ep5: + settings.EP5 = True + settings.PDP4 = False + else: + settings.EP5 = False + + if args.mm == 't': + settings.MMTinker = True + settings.MMMacromodel = False + else: + settings.MMMacromodel = True + settings.MMTinker = False + if args.DFTOpt: + settings.DFTOpt = True + if args.BasicAtoms is not None: + settings.BasicAtoms =\ + [int(x) for x in (args.BasicAtoms).split(',')] + settings.GenProt = True + if args.StepCount is not None: + settings.MMstepcount = int(args.StepCount) + if args.Charge is not None: + settings.charge = int(args.Charge) + if args.GenTautomers: + settings.GenTaut = True + if args.StereoCentres is not None: + settings.SelectedStereocentres =\ + [int(x) for x in (args.StereoCentres).split(',')] + if args.GenOnly: + settings.GenOnly = True + if args.AssumeDFTDone: + settings.AssumeDone = True + if args.solvent: + settings.Solvent = args.solvent + if args.rot5: + settings.Rot5Cycle = True + if args.ra is not None: + settings.RingAtoms =\ + [int(x) for x in (args.ra).split(',')] + if len(args.StructureFiles) == 1: + main(args.StructureFiles[0], args.ExpNMR, 1) + else: + main(args.StructureFiles, args.ExpNMR, len(args.StructureFiles)) + #main() diff --git a/README b/README new file mode 100644 index 0000000..515d494 --- /dev/null +++ b/README @@ -0,0 +1,273 @@ +=============================================================== + +PyDP4 workflow integrating MacroModel/TINKER, Gaussian/NWChem +and DP4 analysis + +version 0.4 + +Copyright (c) 2015 Kristaps Ermanis, Jonathan M. Goodman + +distributed under MIT license + +=============================================================== + +CONTENTS +1) Requirements and Setup +2) Usage +3) NMR Description Format +4) Included Utilites +5) Code Organization + +=============================================================== + +REQUIREMENTS AND SETUP + +All the python and Java files and one utility to convert to and from TINKER +nonstandard xyz file are in the attached archive. They are set up to work +from a centralised location. It is not a requirement, but it is probably +best to add the location to the PATH variable. + +The script currently is set up to use MacroModel for molecular mechanics and +NWChem for DFT and it runs NWChem locally. Gaussian and TINKER is also supported +This setup has several requirements. + +1) One should have MacroModel or TINKER and NWChem or Gaussian. +The beginning PyDP4.py file contains a structure "Settings", where the location +of the TINKER scan executable or MacroModel bmin executable should be specified. + +2) The various manipulations of sdf files (renumbering, ring corner flipping) +requires OpenBabel, including Python bindings. 
The following links provide +instructions for building OpenBabel with Python bindings: +http://openbabel.org/docs/dev/UseTheLibrary/PythonInstall.html +http://openbabel.org/docs/dev/Installation/install.html#compile-bindings +The Settings structure contain path to OpenBabel, but currently it also +needs to be specified in InchiGen.py and FiveConf.py +This dependency can be ignored, if no diastereomer generation or +5 membered ring flipping is done. + +3) Finally, to run calculations on a computational cluster, a passwordless +ssh connection should be set up in both directions - +desktop -> cluster and cluster -> desktop. In most cases the modification +of the relevant functions in Gaussian.py or NWChem.py will be required +to fit your situation. + +4) All development and testing was done on Linux. However, both the scripts +and all the backend software should work equally well on windows with little +modification. + +=================== + +USAGE + +To call the script: +1) With all diastereomer generation: + +PyDP4.py Candidate CandidateNMR + +where Candidate is the sdf file containing 3D coordinates of the candidate +structure (without the extension), +and CandidateNMR contains the NMR description. The NMR description largely +follows the DP4 format, but see bellow for differences. + +Alternatively: + +PyDP4.py -s chloroform Candidate CandidateNMR + +specifies solvent for DFT calculation. If solvent is not given, no solvent is used. + +2) With explicit diastereomer/other candidate structures: + +PyDP4.py Candidate1 Candidate2 Candidate3 ... CandidateNMR + +The script does not attempt to generate diastereomers, simply carries out the +DP4 on the specified candidate structures. + +Script has several other switches, including switching the molecular mechanics and dft software etc. 
+ + -m {t,m}, --mm {t,m} Select molecular mechanics program, t for tinker or m + for macromodel, default is t + + -d {j,g,n,z,w}, --dft {j,g,n,z,w} + Select DFT program, j for Jaguar, g for Gaussian, n + for NWChem, z for Gaussian on ziggy, w for NWChem on + ziggy, default is z (jaguar is not yet implemented) + + --StepCount STEPCOUNT + Specify stereocentres for diastereomer generation + + -s SOLVENT, --solvent SOLVENT + Specify solvent to use for dft calculations + + -q QUEUE, --queue QUEUE + Specify queue for job submission on ziggy + (default is s1) + + -t NTAUT, --ntaut NTAUT + Specify number of explicit tautomers per diastereomer + given in structure files, must be a multiple of + structure files + + -r, --rot5 Manually generate conformers for 5-memebered rings + + --ra RA Specify ring atoms, for the ring to be rotated, useful + for molecules with several 5-membered rings + + --AssumeDFTDone Assume RMSD pruning, DFT setup and DFT calculations + have been run already (saves time when repeating DP4 + analysis) + + -g, --GenOnly Only generate diastereomers and tinker input files, + but don't run any calculations (useful for diastereomer + generation for calculations ran on computers + without OpenBabel) + + -c STEREOCENTRES, --StereoCentres STEREOCENTRES + Specify stereocentres for diastereomer generation + + -T, --GenTautomers Automatically generate tautomers + + -o, --DFTOpt Optimize geometries at DFT level before NMR prediction + + --pd Use python port of DP4 + + -b BASICATOMS, --BasicAtoms BASICATOMS + Generate protonated states on the specified atoms and + consider as tautomers + +More information on those can be obtained by running PyDP4.py -h + +====================== + +NMR DESCRIPTION FORMAT + +NMRFILE example begins: +59.58(C3),127.88(C11),127.52(C10),115.71(C9),157.42(C8),133.98(C23),118.22(C22),115.79(C21),158.00(C20),167.33(C1),59.40(C2),24.50(C31),36.36(C34),71.05(C37),142.14(C42),127.50(C41),114.64(C40),161.02(C39) + +4.81(H5),7.18(H15),6.76(H14),7.22(H28),7.13(H27),3.09(H4),1.73(H32 or H33),1.83(H32 or H33),1.73(H36 or H35),1.73(H36 or H35),4.50(H38),7.32(H47),7.11(H46) + +H15,H16 +H14,H17 +H28,H29 +H27,H30 +H47,H48 +H46,H49 +C10,C12 +C9,C13 +C22,C24 +C21,C25 +C41,C43 +C40,C44 + +OMIT H19,H51 + +:example ends + +Sections are seperated by empty lines. +1) The first section is assigned C shifts, can also be (any). +2) Second section is (un)assigned H shifts. +3) This section defines chemically equivalent atoms. Each line is a new set, +all atoms in a line are treated as equivalent, their computed shifts averaged. +4) Final section, starting with a keyword OMIT defines atoms to be ignored. +Atoms defined in this section do not need a corresponding shift in the NMR +description + + +===================== + +UTILITIES + +There are 2 utilities included, not necessary for the process, but sometimes +useful. + +If the DP4 workflow fails at the TINKER stage, the 2 likely reasons are either +lack of 1gb of free memory or TINKER not accepting the numbering of the sdf +file (this is a bug in TINKER). The latter can be fixed by running the following +script: + +TreeRenum.py Candidate CandidateNMR + +It takes the sdf file and performs a spanning tree renumbering - making sure, +that there are as many connected atoms in sequence as possible. So far this +has always solved the TINKER problem. +The script also renumbers the NMR description file, if it contains any atom numbers. +The renumbered files are saved as Candidater and CandidateNMRr (r appended to their +original name). 
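For scripting around the NMR description format above, it is convenient that
the file splits on blank lines into the four sections (carbon shifts, proton
shifts, equivalent-atom sets, OMIT list). A minimal reader sketch, assuming a
file laid out exactly as in the example; this helper is illustrative and not
part of the PyDP4 scripts themselves:

def read_nmr_description(path):
    # Split the description into its blank-line separated sections.
    with open(path) as f:
        sections = [s.strip() for s in f.read().split('\n\n') if s.strip()]
    cshifts, hshifts = sections[0], sections[1]
    equivalents = [line.split(',') for line in sections[2].splitlines()
                   if not line.startswith('OMIT')]
    omits = [line[5:].split(',') for sec in sections[2:]
             for line in sec.splitlines() if line.startswith('OMIT')]
    return cshifts, hshifts, equivalents, omits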
+ +---------------------- +Another utility is NMRhelper (called by simply typing NMRhelper.py in shell). +It is a script with GUI interface, that assists in describing and assigning the +NMR. In the top textbox a structure file can be chosen. This allows the utility +to automatically detect protons attached to heteroatoms and add them to the +OMIT list, as well as detect the chemically eqivalent atoms (currently only +implemented for methyl groups). It also lets the script to help tracking which +atoms are yet to be assigned (show in the bottom 2 text boxes). +The next 2 large textboxes are for pasting raw NMR descriptions.Based on the pasted +text, the script will try to detect the shifts and make up a rough draft of the +description file. After this the final version can be prepared in the main textbox. +At any point the button to generate the NMR file can be pressed and this will write +the file to the NMRhelper folder with the name CandidateNMR, where Candidate is the +name of the structure file. +IMPORTANT NOTE: Do not edit the raw data textboxes, if you have done any work in the +main textbox, as this will cause the main textbox to revert to the rough +automatically generated version + + +===================== + +CODE ORGANIZATION + +The code is organized in several python script files, as well as several java +files. + +PyDP4.py +Main file, that should be called to start the PyDP4 workflow. Interprets the +arguments and takes care of the general workflow logic. + +InchiGen.py +Gets called if diastereomer and/or tautomer and/or protomer generation is +used. Called by PyDP4.py. + +FiveConf.py +Gets called if automatic 5-membered cycle corner-flipping is used. Called by +PyDP4.py. + +MacroModel.py +Contains all of the MacroModel specific code for input generation, calculation +execution and output interpretation. Called by PyDP4.py. + +Tinker.py +Contains all of the Tinker specific code for input generation, calculation +execution and output interpretation. Called by PyDP4.py. + +ConfPrune.pyx +Cython file for conformer alignment and RMSD pruning. Called by Gaussian.py +and NWChem.py + +Gaussian.py +Contains all of the Gaussian specific code for input generation and calculation +execution. Called by PyDP4.py. + +NWChem.py +Contains all of the NWChem specific code for input generation and calculation +execution. Called by PyDP4.py. + +NMRDP4GTF.py +Takes care of all the NMR description interpretation, equivalent atom +averaging, Boltzmann averaging, tautomer population optimisation (if used) +and DP4 input preparation and running either DP4.jar or DP4.py. Called by +PyDP4.py + +nmrPredictNWChem.py +Extracts NMR shifts from NWChem output files + +nmrPredictGaussian.java +Extracts NMR shifts from Gaussian output files + +DP4.jar +Original DP4 implementation as in J. Am. Chem. Soc. 2010, 132, 12946. + +DP4.py +Equivalent and compact port to python of the same DP4 process. The results +produced are essentially equivalent, but not identical due to different +floating point precision used in the Python (53 bits) and Java (32 bits) +implementation. diff --git a/Tinker.py b/Tinker.py new file mode 100755 index 0000000..8edbb08 --- /dev/null +++ b/Tinker.py @@ -0,0 +1,264 @@ +# -*- coding: utf-8 -*- +""" +Created on Fri Jun 12 12:52:21 2015 + +@author: ke291 + +Contains all of the Tinker specific code for input generation, calculation +execution and output interpretation. Called by PyDP4.py. 
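ReadTinker below keeps every conformer whose energy lies within
settings.MaxCutoffEnergy of the global minimum before the geometries are
passed on. A minimal sketch of that window filter (names are illustrative,
energies being whatever unit the scan output reports):

    def within_window(energies, cutoff):
        # Return the indices of conformers inside the energy window.
        emin = min(energies)
        return [i for i, e in enumerate(energies) if e < emin + cutoff]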
+""" + +import os +import sys +import subprocess + + +def SetupTinker(numDS, settings, *args): + + print args + for ds in args: + if settings.Rot5Cycle is True: + if not os.path.exists(ds+'rot.sdf'): + import FiveConf + #Generate the flipped fivemembered ring, + #result is put in '*rot.sdf' file + FiveConf.main(ds + '.sdf', settings) + + #Makes sure that the name in the title line matches filename + #sdf2tinkerxyz uses this as the output file name + f = open(ds + '.sdf', 'r+') + sdf = f.readlines() + sdf[0] = ds + '\n' + f.seek(0) + f.writelines(sdf) + f.close() + + if settings.Rot5Cycle is True: + f = open(ds + 'rot.sdf', 'r+') + sdf = f.readlines() + sdf[0] = ds + 'rot\n' + if ds in sdf[-3]: + sdf[-3] = ds + 'rot.1\n' + f.seek(0) + f.writelines(sdf) + f.close() + + scriptdir = getScriptPath() + convinp = 'sdf2tinkerxyz -k ' + scriptdir + '/default.key <' + + outp = subprocess.check_output(convinp + ds + '.sdf', shell=True) + + print "Tinker input for " + ds + " prepared." + + if settings.Rot5Cycle is True: + outp = subprocess.check_output(convinp + ds + 'rot.sdf', + shell=True) + print "Tinker input for " + ds + "rot prepared." + + +def RunTinker(numDS, settings, *args): + #Run Tinker scan for all diastereomeric inputs + + NCompleted = 0 + + for ds in args: + print settings.TinkerPath + ds + ' 0 10 20 0.00001 | tee ./' + ds + \ + '.tout' + outp = subprocess.check_output(settings.TinkerPath + ds + + ' 0 10 20 0.00001 | tee ./' + ds + '.tout', shell=True) + NCompleted = NCompleted + 1 + print "Tinker job " + str(NCompleted) + " of " + str(numDS) + \ + " completed." + + if settings.Rot5Cycle is True: + print settings.TinkerPath + ds + 'rot 0 10 20 0.00001 | tee ./' + \ + ds + 'rot.tout' + outp = subprocess.check_output(settings.TinkerPath + ds + + 'rot 0 10 20 0.00001 | tee ./' + ds + 'rot.tout', shell=True) + NCompleted = NCompleted + 1 + print "Tinker job " + str(NCompleted) + " of " + str(numDS*2) + \ + " completed." + + +#Reads the relevant tinker geometry files +#v0.2 - reads seperate rot file as well +def ReadTinker(TinkerOutput, settings): + + #Get conformer energies + ETable, charge = GetEnergiesCharge(TinkerOutput, settings) + + if settings.Rot5Cycle is True: + #Get conformer energies for the flipped 5-membered ring + ETableRot, charge = GetEnergiesCharge(TinkerOutput + 'rot', settings) + + #Determine which conformers we want + MinE = 10000 + MinE = min([float(x[1]) for x in ETable]) + + if settings.Rot5Cycle is True: + MinERot = 10000 + MinERot = min([float(x[1]) for x in ETableRot]) + if MinE > MinERot: + MinE = MinERot + + FileNums = [] + RotFileNums = [] + + AcceptedEs = [] + + for conf in ETable: + if float(conf[1]) < MinE + settings.MaxCutoffEnergy: + #Dealing with special case when nconf>100 000 + if 'Minimum' in conf[0]: + data = conf[0].strip() + FileNums.append(data[7:].strip()) + else: + FileNums.append(conf[0].strip()) + AcceptedEs.append(float(conf[1])) + + if settings.Rot5Cycle is True: + for conf in ETableRot: + if float(conf[1]) < MinE + settings.MaxCutoffEnergy: + RotFileNums.append(conf[0].strip()) + AcceptedEs.append(float(conf[1])) + + print "Number of accepted conformers by energies: " + str(len(AcceptedEs)) + + Files = [] + #Generate conformer filenames + for num in FileNums: + Files.append(TinkerOutput + '.' + num.zfill(3)) + if settings.Rot5Cycle is True: + for num in RotFileNums: + Files.append(TinkerOutput + 'rot.' 
+ num.zfill(3)) + + conformers = [] + conformer = 0 + atoms = [] + + for f in Files: + conformers.append([]) + + atom = 0 + infile = open(f, 'r') + inp = infile.readlines() + + for line in inp[1:]: + data = line.split(' ') + data = filter(None, data) + if conformer == 0: + atoms.append(GetTinkerSymbol(int(data[5]))) # Add atom symbol + conformers[conformer].append([]) # Add new atom + conformers[conformer][atom].append(data[0]) # add atom number + conformers[conformer][atom].append(data[2]) # add X + conformers[conformer][atom].append(data[3]) # add Y + conformers[conformer][atom].append(data[4]) # add Z + atom = atom + 1 # Move to the next atom + + infile.close() + conformer = conformer + 1 # Move to the next conformer + + return atoms, conformers, charge + + +# Get energies of conformers from tinker output file +def GetEnergiesCharge(TinkerOutput, settings): + + infile = open(TinkerOutput + '.tout', 'r') + + inp = infile.readlines() + if len(inp) < 56: + print "Tinker output " + TinkerOutput + " is corrupted, aborting." + quit() + + #Get the conformer energies from the file + ETable = [] + for line in inp[13:]: + data = line.split(' ') + data = filter(None, data) + if len(data) >= 3: + if 'Map' in data[0] and 'Minimum' in data[1]: + ETable.append(data[-2:]) + #print data + + infile.close() + if settings.charge is None: + if os.path.exists(TinkerOutput+'.inchi'): + return ETable, GetInchiCharge(TinkerOutput) + else: + return ETable, GetSDFCharge(TinkerOutput) + else: + return ETable, settings.charge + + +#translate Tinker atom types to element symbols for NWChem file +def GetTinkerSymbol(atomType): + + Lookup = ['C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', ' ', 'C', + 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', 'C', + 'C', 'C', 'H', 'H', 'O', 'O', 'O', 'O', 'O', 'O', + 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', + 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'N', 'N', + 'N', 'N', 'N', 'N', 'N', 'F', 'Cl', 'Br', 'I', 'S', + 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', + 'Si', 'C', 'H', 'H', 'H', 'C', 'H', 'H', 'H', 'H', + 'H', 'H', 'H', 'H', 'P', 'P', 'P', 'P', 'P', 'P', + 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', + 'H', 'H', 'H', 'H', 'C', 'H', 'O', 'O', 'O', 'O', + 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', + 'O', 'H', 'N', 'O', 'O', 'H', 'H', 'H', 'H', 'H', + 'H', 'H', 'C', 'N', 'N', 'N', 'N', 'N', 'N', 'C', + 'C', 'N', 'N', 'N', 'N', 'N', 'N', 'S', 'N', 'N', + 'N', 'N', 'N', 'O', 'H', 'O', 'H', 'N', 'N', 'N', + 'N', 'N', 'C', 'C', 'N', 'O', 'C', 'N', 'N', 'C', + 'C', 'N', 'N', 'N', 'N', 'N', 'O', 'H', 'H', 'H', + 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'P', 'N', 'Cl', + 'C', 'N', 'C', 'N', 'N', 'N', 'N', 'N', 'N', 'N', + 'Fe', 'Fe', 'F', 'Cl', 'Br', 'Li', 'Na', 'K', 'Zn', 'Zn', + 'Ca', 'Cu', 'Cu', 'Mg'] + + if Lookup[atomType-1] == ' ': + print 'Unknown atom type' + + return Lookup[atomType-1] + + +def GetInchiCharge(inchifile): + + infile = open(inchifile + '.inchi', 'r') + inp = infile.readlines() + infile.close() + + ChargeFound = False + #Get inchi layers + layers = inp[0].split('/') + for l in layers[1:]: + if 'q' in l: + charge = int(l[1:]) + ChargeFound = True + break + if 'p' in l: + charge = int(l[1:]) + ChargeFound = True + break + if not ChargeFound: + charge = 0 + + return charge + + +def GetSDFCharge(sdf): + import sys + sys.path.append('/home/ke291/Tools/openbabel-install/lib/python2.7/site-packages/') + import openbabel + + obconversion = openbabel.OBConversion() + obconversion.SetInFormat("sdf") + obmol = openbabel.OBMol() + obconversion.ReadFile(obmol, 
sdf+'.sdf') + + return obmol.GetTotalCharge() + + +def getScriptPath(): + return os.path.dirname(os.path.realpath(sys.argv[0])) diff --git a/TreeRenum.py b/TreeRenum.py new file mode 100755 index 0000000..0eecdde --- /dev/null +++ b/TreeRenum.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Created on Tue Mar 31 14:54:17 2015 + +@author: ke291 +""" +import sys +sys.path.append('/home/ke291/Tools/openbabel-install/lib/python2.7/site-packages/') +from openbabel import * + + +def FindAllPaths(molgraph, start, end, path=[]): + path = path + [start] + if start == end: + return [path] + if start not in [x[0] for x in molgraph]: + print "No such node in graph" + return [] + paths = [] + for node in molgraph[start-1][1:]: + if node not in path: + newpaths = FindAllPaths(molgraph, node, end, path) + for newpath in newpaths: + paths.append(newpath) + return paths + + +def FindTerminatingPaths(molgraph, start, trunk, path=[]): + path = path + [start] + if start not in [x[0] for x in molgraph]: + print "No such node in graph" + return [] + paths = [] + for node in molgraph[start-1][1:]: + if node not in path and node not in trunk: + newpaths = FindTerminatingPaths(molgraph, node, trunk, path) + for newpath in newpaths: + paths.append(newpath) + if paths == []: + return [path] + else: + return paths + + +def FindTreeMap(f): + #get molecular graph + molgraph = GenMolGraph(f) + + #Find potential endpoints - atoms with only one neighbour + endpoints = [] + for node in molgraph: + if len(node) < 3: + endpoints.append(node[0]) + + #get the longest paths for all endpoint combinations + maxpaths = [] + for i in range(0, len(endpoints)): + for j in range(0, len(endpoints)): + if i != j: + maxpaths.append(max(FindAllPaths(molgraph, endpoints[i], + endpoints[j]), key=len)) + #get the absolute longest path possible in the molecule + molmap = max(maxpaths, key=len) + #Add longest branches to the longest trunk + for atom in molmap: + for node in molgraph[atom-1]: + if node not in molmap: + maxbranch = [] + branches = FindTerminatingPaths(molgraph, node, molmap) + if branches != []: + maxbranch = max(branches, key=len) + if maxbranch != []: + molmap.extend(maxbranch) + return molmap + + +def TreeRenumSDF(f, ExpNMR): + + molmap = FindTreeMap(f) + + #Read molecule from file + obconversion = OBConversion() + obconversion.SetInFormat("sdf") + obmol = OBMol() + obconversion.ReadFile(obmol, f) + + temp = [] + anums = [] + for atom in OBMolAtomIter(obmol): + temp.append(atom) + anums.append(atom.GetAtomicNum()) + + newmol = OBMol() + for atom in molmap: + newmol.AddAtom(temp[atom-1]) + + #Restore the bonds + newmol.ConnectTheDots() + newmol.PerceiveBondOrders() + #Write renumbered molecule to file + obconversion.SetOutFormat("sdf") + obconversion.WriteFile(newmol, f[:-4] + 'r.sdf') + + #Prepare NMR translation + NMRmap = [] + i = 1 + for atom in molmap: + anum = anums[atom-1] + if anum == 1: + NMRmap.append(['H' + str(atom), 'H' + str(i)]) + if anum == 6: + NMRmap.append(['C' + str(atom), 'C' + str(i)]) + i += 1 + print NMRmap + RenumNMR(ExpNMR, NMRmap) + + +def RenumNMR(ExpNMR, NMRmap): + f = open(ExpNMR, 'r') + NMRfile = f.read(1000000) + f.close() + + print '\nOld NMR file:\n' + NMRfile + + #Replace old atom labels with new atom labels + #tag replacements with '_' to avoid double replacement + for atom in NMRmap: + NMRfile = NMRfile.replace(atom[0] + ')', atom[1] + '_)') + NMRfile = NMRfile.replace(atom[0] + ' ', atom[1] + '_ ') + NMRfile = NMRfile.replace(atom[0] + ',', atom[1] + '_,') + 
+        NMRfile = NMRfile.replace(atom[0] + '\n', atom[1] + '_\n')
+
+    #Strip temporary underscore tags
+    NMRfile = NMRfile.replace('_', '')
+
+    print '\nNew NMR file:\n' + NMRfile
+    f = open(ExpNMR + 'r', 'w')
+    f.write(NMRfile)
+    f.close()
+
+
+def GenMolGraph(f):
+    obconversion = OBConversion()
+    obconversion.SetInFormat("sdf")
+    obmol = OBMol()
+    obconversion.ReadFile(obmol, f)
+
+    molgraph = []
+
+    for atom in OBMolAtomIter(obmol):
+        idx = atom.GetIdx()
+        molgraph.append([])
+        molgraph[idx-1].append(idx)
+
+        for NbrAtom in OBAtomAtomIter(atom):
+            molgraph[idx-1].append(NbrAtom.GetIdx())
+
+    return molgraph
+
+if __name__ == '__main__':
+    #print sys.argv
+    ExpNMR = sys.argv[2]
+    filename = sys.argv[1] + '.sdf'
+    TreeRenumSDF(filename, ExpNMR)
+    #main()
\ No newline at end of file
diff --git a/default.com b/default.com
new file mode 100644
index 0000000..a8e1a29
--- /dev/null
+++ b/default.com
@@ -0,0 +1,18 @@
+confsearch.mae
+confsearch-out.maegz
+ MMOD 0 1 0 0 0.0000 0.0000 0.0000 0.0000
+ FFLD 10 1 0 0 1.0000 0.0000 0.0000 0.0000
+ BDCO 0 0 0 0 41.5692 99999.0000 0.0000 0.0000
+ READ 0 0 0 0 0.0000 0.0000 0.0000 0.0000
+ CRMS 0 0 0 0 0.0000 0.2500 0.0000 0.0000
+ LMCS 5000 0 0 0 0.0000 0.0000 3.0000 6.0000
+ NANT 0 0 0 0 0.0000 0.0000 0.0000 0.0000
+ MCNV 0 0 0 0 0.0000 0.0000 0.0000 0.0000
+ MCSS 2 0 0 0 21.0000 0.0000 0.0000 0.0000
+ MCOP 1 0 0 0 0.5000 0.0000 0.0000 0.0000
+ DEMX 0 333333 0 0 21.0000 42.0000 0.0000 0.0000
+ MSYM 0 0 0 0 0.0000 0.0000 0.0000 0.0000
+ AUOP 0 0 0 0 500.0000 0.0000 0.0000 0.0000
+ AUTO 0 2 1 1 0.0000 -1.0000 0.0000 2.0000
+ CONV 2 0 0 0 0.0010 0.0000 0.0000 0.0000
+ MINI 1 0 999999 0 0.0000 0.0000 0.0000 0.0000
diff --git a/default.key b/default.key
new file mode 100644
index 0000000..2feb51d
--- /dev/null
+++ b/default.key
@@ -0,0 +1,11 @@
+
+# Force Field Selection
+PARAMETERS /home/ke291/tinker/params/mmff.prm
+
+# Random Number
+RANDOMSEED 123456789
+
+# Constraint And Restraint
+ENFORCE-CHIRALITY
+
+
diff --git a/default1.com b/default1.com
new file mode 100644
index 0000000..9468638
--- /dev/null
+++ b/default1.com
@@ -0,0 +1,17 @@
+default1.mae
+default1-out.maegz
+ MMOD 0 1 0 0 0.0000 0.0000 0.0000 0.0000
+ FFLD 10 1 0 0 1.0000 0.0000 0.0000 0.0000
+ BDCO 0 0 0 0 41.5692 99999.0000 0.0000 0.0000
+ READ 0 0 0 0 0.0000 0.0000 0.0000 0.0000
+ CRMS 0 0 0 0 0.0000 0.2500 0.0000 0.0000
+ LMCS 6000 0 0 0 0.0000 0.0000 3.0000 6.0000
+ MCNV 0 0 0 0 0.0000 0.0000 0.0000 0.0000
+ MCSS 2 0 0 0 21.0000 0.0000 0.0000 0.0000
+ MCOP 1 0 0 0 0.5000 0.0000 0.0000 0.0000
+ DEMX 0 333333 0 0 21.0000 42.0000 0.0000 0.0000
+ MSYM 0 0 0 0 0.0000 0.0000 0.0000 0.0000
+ AUOP 0 0 0 0 300.0000 0.0000 0.0000 0.0000
+ AUTO 0 2 1 1 0.0000 -1.0000 0.0000 3.0000
+ CONV 2 0 0 0 0.0010 0.0000 0.0000 0.0000
+ MINI 1 0 999999 0 0.0000 0.0000 0.0000 0.0000
diff --git a/nmrPredictGaussian.class b/nmrPredictGaussian.class
new file mode 100644
index 0000000..197792a
Binary files /dev/null and b/nmrPredictGaussian.class differ
diff --git a/nmrPredictGaussian.java b/nmrPredictGaussian.java
new file mode 100644
index 0000000..1d7da3e
--- /dev/null
+++ b/nmrPredictGaussian.java
@@ -0,0 +1,208 @@
+import java.io.*;
+import java.util.*;
+import java.math.*;
+
+
+/*
+This program reads Gaussian output files and pulls out the NMR shielding constants for each atom in each conformer. It also extracts energies and calculates a Boltzmann-averaged shielding constant for each atom.
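+
+(Illustrative sketch only; sigma_i, E_i and E_min below are not identifiers from this file.
+The Boltzmann averaging referred to above has the standard form
+
+    sigma_avg = sum_i sigma_i * w_i,   w_i = exp(-(E_i - E_min)/(R*T)) / sum_j exp(-(E_j - E_min)/(R*T))
+
+where E_i is the energy of conformer i, R the gas constant and T the temperature; see the
+gasConstant, temperature and hartreeEnergy constants declared below.)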
+
+Input syntax is the names of the output files without the .out:
+
+java nmrPredictGaussian myMoleculeCnf001 myMoleculeCnf002 myMoleculeCnf003 ....
+
+(or java nmrPredictGaussian `ls myMoleculeCnf*.out | cut -d"." -f1`)
+
+The program prints the output as a comma-separated list suitable for pasting into a spreadsheet.
+
+*/
+
+
+class nmrPredictGaussian
+{
+
+    static double gasConstant = 8.3145;
+    static double temperature = 298.15;
+    static double hartreeEnergy = 2625.499629554010;
+
+
+    public static void main(String[] args)
+    {
+        int nstructs = args.length;
+        int natoms = 0;
+
+        String[] fileName = new String[nstructs];
+        for(int i=0;i