useful(?) scripts that use rdkit

dkoes · Oct 27, 2016 · b580b64 · b580b64
1 parent bb9b91e
commit b580b64
Show file tree

Hide file tree

Showing 11 changed files with 1,326 additions and 0 deletions.
diff --git a/applysdfcoordstopdb.py b/applysdfcoordstopdb.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+'''Given a pdb file and an sdf file of the same ligand, find the correspondance
+between the atoms and output a pdb with the same atom names as the pdb input
+but with the coordinates of the sdf file. Atoms missing in the sdf 
+will not have their coordinates changed, but note that hydrogens are stripped'''
+
+import sys
+from rdkit.Chem import AllChem
+
+if len(sys.argv) != 3:
+    print "Need sdf and pdb files"
+    sys.exit(1)
+
+sdf = sys.argv[1]
+pdb = sys.argv[2]
+
+#figure out starting atom number
+startnum = 1
+for line in open(pdb):
+    if line.startswith('ATOM') or line.startswith('HETATM'):
+	startnum = int(line[6:11])
+	break
+
+pmol = AllChem.MolFromPDBFile(pdb)
+smol = AllChem.SDMolSupplier("newsnappy.sdf").next()
+pmol = AllChem.AssignBondOrdersFromTemplate(smol,pmol)
+
+if smol.HasSubstructMatch(pmol):
+    m = smol.GetSubstructMatch(pmol)
+    pconf = pmol.GetConformer()
+    sconf = smol.GetConformer()
+    for (pi, si) in enumerate(m):
+	pconf.SetAtomPosition(pi, sconf.GetAtomPosition(si))
+    pdbout = AllChem.MolToPDBBlock(pmol, flavor=2).split('\n')
+    num = startnum
+    for line in pdbout:
+	if line.startswith('ATOM') or line.startswith('HETATM'):
+	    print '%s%5d%s' % (line[:6],num,line[11:])
+	    num += 1
+else:
+    print "Could not matchup pdb and sdf"
+    sys.exit(1)
diff --git a/clustermatches.py b/clustermatches.py
@@ -0,0 +1,177 @@
+#!/usr/bin/python
+
+#given a smarts rxn file, a core scaffold smarts file (with name containing connecting atoms)
+# and an sdf file, extract the matching scaffolds from of the sdf file and cluster
+#them greedily to identify a set of scaffold conformations
+
+import sys,gzip,argparse
+from rdkit.Chem import AllChem
+
+def subMol(mol, match):
+	#not sure why this functionality isn't implemented natively
+	#but get the interconnected bonds for the match
+	atoms = set(match)
+	bonds = set()
+	for a in atoms:
+		atom = mol.GetAtomWithIdx(a)
+		for b in atom.GetBonds():
+			if b.GetOtherAtomIdx(a) in atoms:
+				bonds.add(b.GetIdx())
+	return AllChem.PathToSubmol(mol,list(bonds))
+
+#compute the distances between the matching connecting atoms 
+#and return true if all distance are small enough
+def checkConnect(center, cmatch, mol, match, connectIndices, connect):
+	cconf = center.GetConformer(0)
+	mconf = mol.GetConformer(0)
+	for i in connectIndices:
+		cidx = cmatch[i]
+		midx = match[i]
+		cpt = cconf.GetAtomPosition(cidx)
+		mpt = mconf.GetAtomPosition(midx)
+		dist = cpt.Distance(mpt)
+		if dist > connect:
+			return False
+	return True
+
+#find all the scaffolds in mols that are within rmsd of center and where the connecting
+#atoms are within connect, return new cluster and new mols
+def createCluster(center,cmatch, mols, pattern, core, rmsd, connectIndices, connect):
+	cluster = list()
+	newmols = list()
+	for (mol,match) in mols:
+		r = AllChem.GetBestRMS(mol,center,maps=[zip(cmatch,match)])
+		if r < rmsd and checkConnect(center, cmatch, mol,match,connectIndices, connect):
+			cluster.append((mol,match,r))
+		else:
+			newmols.append((mol,match))
+	cluster.sort(key = lambda (m,mtch,r): r )
+	return (cluster, newmols)
+
+#find the mol in mols that has the maximum minimum distance between the first
+#mol in each cluster
+#ACTUALLY, for the tight tolerances we need, this really doesn't make a difference
+#and just slows things down, so just pick the first available conformer
+def computeNext(clusters,mols):
+	if len(mols) > 0:
+		return mols[0]
+	else:
+		return (None,None)
+	max = 0
+	best = (None,None)
+	for (mol,match) in mols:
+		min = float('inf')
+		for cl in clusters:
+			cmol = cl[0][0]
+			cmatch = cl[0][1]
+			r = AllChem.GetBestRMS(cmol,mol,maps=[zip(match,cmatch)])
+			if r < min:
+				min = r
+		if min > max:
+			max = min
+			best = (mol,match)
+	return best
+
+#MAIN
+if len(sys.argv) < 5:
+	print "Need reaction file, core scaffold file, sdf input file and sdf output"
+	sys.exit(1)
+
+parser = argparse.ArgumentParser()
+parser.add_argument('-r','--rxn', help="Reaction file")
+parser.add_argument('-c','--core',help="Core scaffold with connecting atoms in name")
+parser.add_argument('-i','--input',help="Input conformers")
+parser.add_argument('-o','--output',help="Clustered core scaffold output")
+parser.add_argument("--rmsd",type=float,default=0.5,help="Maximum RMSD for cluster membership")
+parser.add_argument("--connect",type=float,default=0.1,help="Maximum allowed deviation of connecting atoms for cluster membership")
+parser.add_argument("--sample",type=int,default=1,help="Amount to sample conformations")
+args = parser.parse_args()
+
+rxnf = open(args.rxn)
+rxnsm = rxnf.readline().split()[0] #ignore any name
+rxn = AllChem.ReactionFromSmarts(rxnsm)
+rxn.Initialize()
+
+if rxn.GetNumProductTemplates() == 1:
+	product = rxn.GetProductTemplate(0)
+	reactants = list()
+	for i in xrange(rxn.GetNumReactantTemplates()):
+		reactants.append(rxn.GetReactantTemplate(i))
+elif rxn.GetNumReactantTemplates() == 1:
+	product = rxn.GetReactantTemplate(0)
+	reactants = list()
+	for i in xrange(rxn.GetNumProductTemplates()):
+		reactants.append(rxn.GetProductTemplate(i))
+else:
+	print "Can have only one product"
+	sys.exit(1)
+
+coref = open(args.core)
+corel = coref.readline()
+coreconnects = corel.split()[1:]
+core = AllChem.MolFromSmarts(corel.split()[0])
+
+inmols = AllChem.SDMolSupplier(args.input)
+if inmols is None:
+	print "Could not open ",args.input
+	sys.exit(-1)
+
+sdwriter = AllChem.SDWriter(args.output)
+if sdwriter is None:
+	print "Could not open ",args.output
+	sys.exit(-1)
+
+smart = AllChem.MolToSmarts(product)
+pattern = AllChem.MolFromSmarts(smart)
+
+#figure out the indices of connected atoms in the smart core pattern
+connectIndices = list()
+for c in coreconnects:
+	cm = AllChem.MolFromSmarts(c)
+	a = cm.GetAtoms()[0]
+	if a.HasProp('molAtomMapNumber'):
+		mapnum = a.GetProp('molAtomMapNumber')
+		for sma in core.GetAtoms(): 
+			if sma.HasProp('molAtomMapNumber') and sma.GetProp('molAtomMapNumber') == mapnum:
+				connectIndices.append(sma.GetIdx())
+
+#read all core scaffold molecules into memory
+mols = list()
+cnt = 0
+for mol in inmols:
+	if cnt % args.sample == 0 and mol is not None:
+		try:
+			mol = AllChem.AddHs(mol)
+			match = mol.GetSubstructMatch(pattern) #just one? why not, we're only sampling
+			if match:
+				sub = subMol(mol, match)
+				cmatch = sub.GetSubstructMatch(core)
+				if cmatch:
+					sub = subMol(sub,cmatch)
+					mols.append((sub,sub.GetSubstructMatch(core)))
+		except (KeyboardInterrupt, SystemExit):
+			raise		
+		except Exception as e:
+			print "Exception occurred",mol.GetProp('_Name'),e
+	cnt += 1
+
+if len(mols) == 0:
+	print "No molecules!"
+	sys.exit(-1)
+print "Done reading"	
+clusters = list() #these are just defined by a list of all the scffolds assigned to the cluster
+(center, cmatch) = mols[0]
+
+while len(mols) > 0:
+	(cluster, mols) = createCluster(center,cmatch,mols, pattern, core, args.rmsd, connectIndices, args.connect)
+	clusters.append(cluster)
+	(center, cmatch) = computeNext(clusters,mols)
+
+print len(clusters)
+for cl in clusters:
+	cmol = cl[0][0]
+	cmol.SetProp("ClusterSize",str(len(cl)))
+	AllChem.GetBestRMS(clusters[0][0][0],cmol) #align to very first 
+	sdwriter.write(cmol)
+sdwriter.close()
+
diff --git a/corerxnscaffold.py b/corerxnscaffold.py
@@ -0,0 +1,110 @@
+#!/usr/bin/python
+
+#given a rxn smarts (either forward or backwards, but there is assumed to be
+#only one product) identify a minimal core scaffold that has every connecting atom
+#from each reactant and only those scaffold atoms that are necessary to minimally
+#connect these atoms
+#this assume the input smarts is fully annoated with atom mappings for the heavy atoms
+#output the smarts (with atom numbers) of the core scaffold
+
+import sys
+from rdkit.Chem import AllChem
+import igraph
+
+Debug = False
+
+if len(sys.argv) < 1:
+	print "Need reaction file"
+	sys.exit(1)
+if len(sys.argv) > 2:
+	Debug = True
+
+rxnf = open(sys.argv[1])
+rxnsm = rxnf.readline()
+rxn = AllChem.ReactionFromSmarts(rxnsm)
+rxn.Initialize()
+
+if rxn.GetNumProductTemplates() == 1:
+	product = rxn.GetProductTemplate(0)
+	reactants = list()
+	for i in xrange(rxn.GetNumReactantTemplates()):
+		reactants.append(rxn.GetReactantTemplate(i))
+elif rxn.GetNumReactantTemplates() == 1:
+	product = rxn.GetReactantTemplate(0)
+	reactants = list()
+	for i in xrange(rxn.GetNumProductTemplates()):
+		reactants.append(rxn.GetProductTemplate(i))
+else:
+	print "Can have only one product"
+	sys.exit(1)
+
+#record what atom mapping number belong to each reactant
+reactantMapNumbers = dict()
+for (i, react) in enumerate(reactants):
+	for a in react.GetAtoms():
+		if a.HasProp('molAtomMapNumber'):
+			mapnum = a.GetProp('molAtomMapNumber')
+			reactantMapNumbers[mapnum] = i
+
+#now create an igraph from the product
+molgraph = igraph.Graph()
+molgraph.add_vertices(product.GetNumAtoms())
+
+for a in product.GetAtoms():
+	s = a.GetIdx()
+	v = molgraph.vs[s]
+	v['react'] = -1 #reactant this atom belongs to, -1 if none
+	v['heavy'] = a.GetAtomicNum() != 1 #note this things [#1,#6] is H only, but that should be okay - if H can go there, no need to worry about scaffold connectivity 
+	if a.HasProp('molAtomMapNumber'):
+		mapnum = a.GetProp('molAtomMapNumber')
+		if mapnum in reactantMapNumbers:
+			v['react'] = reactantMapNumbers[mapnum]
+	for na in a.GetNeighbors():
+		t = na.GetIdx()
+		molgraph.add_edge(s,t)
+
+#now identify the connecting heavy atoms for reactants
+connecting = list()
+for v in molgraph.vs:
+	if v['react'] >= 0 and v['heavy']:
+		r = v['react']
+		for nv in v.neighbors():
+			if nv['react'] != r and nv['heavy']:
+				connecting.append(v.index)
+				break
+
+if Debug:
+	for i in connecting:
+		print product.GetAtomWithIdx(i).GetSmarts(),molgraph.vs[i]['react']
+
+#compute the shortest paths between all these connecting atoms
+#the atoms along these paths make the minimal scaffold
+scaffold = set()
+for i in connecting:
+	paths = molgraph.get_all_shortest_paths(i,connecting,mode=igraph.ALL)
+	for p in paths:
+		for v in p:
+			scaffold.add(v)
+
+if Debug:
+	print "\nScaffold"
+	for i in scaffold:
+		print product.GetAtomWithIdx(i).GetSmarts(),molgraph.vs[i]['react']
+
+#need to identify the bonds between the scaffold atoms
+bonds = set()
+for i in scaffold:
+	a = product.GetAtomWithIdx(i)
+	for b in a.GetBonds():
+		end = b.GetOtherAtomIdx(i)
+		if end in scaffold:
+			bonds.add(b.GetIdx())
+
+if Debug:
+	print "Bonds",list(bonds)
+
+subMol = AllChem.PathToSubmol(product,list(bonds))
+print AllChem.MolToSmarts(subMol),
+for i in connecting:
+	print product.GetAtomWithIdx(i).GetSmarts(),
+print ""