-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathparse_data.py
284 lines (243 loc) · 10.4 KB
/
parse_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
# -*- coding: UTF-8 -*-
import os, sys
import numpy as np
import cclib
from abc import ABC, abstractmethod
from dbstep.constants import BOHR_TO_ANG, periodic_table
"""
parse_data
Parses data from files
Currently supporting:
.cube Gaussian volumetric files
all filetypes parsed by the cclib python package (see https://cclib.github.io/)
"""
def read_input(molecule, ext, options):
"""Chooses a Parser based on input molecule format.
Args:
molecule (str or mol object): path to file if molecule represented as one, or RDKit mol object
ext (str): file extension used
options (dict): options for DBSTEP program
Returns:
DataParser object with parsed molecule data to be used by the rest of the program
"""
if ext == '.cube':
options.surface = 'density'
mol = CubeParser(molecule, "cube")
else:
if ext in [".xyz", '.com', '.gjf']:
mol = XYZParser(molecule, ext[1:], options.noH, options.exclude, options.spec_atom_1, options.spec_atom_2)
elif ext == 'rdkit':
mol = RDKitParser(molecule, options.noH, options.exclude, options.spec_atom_1, options.spec_atom_2)
else:
mol = cclibParser(molecule, ext[1:], options.noH, options.exclude, options.spec_atom_1, options.spec_atom_2)
if options.noH or options.exclude:
options.spec_atom_1 = mol.spec_atom_1
options.spec_atom_2 = mol.spec_atom_2
return mol
class DataParser(ABC):
"""Abstract base class made to be inherited by parsers for different molecule formats.
Attributes:
_input (str or RDKit mol object): the input molecule
FORMAT (str): format of the input molecule
ATOMTYPES (numpy array of char): the atoms in the molecule, starts as a list
CARTESIANS (numpy array of tuples): xyz coordinates for each atom in the molecule, starts as a list
noH (bool): true if hydrogens should be removed false otherwise.
exclude (str): atoms to exclude from steric measurements (1-indexed)
spec_atom_1 (int): specifies atom1
spec_atom_2 (list of int): specifies atom2(s)
file_lines (list of str, optional): each line of the file
"""
def __init__(self, _input, input_format, noH=False, exclude=False, spec_atom_1=None, spec_atom_2=None, manual_file_lines=False):
"""Initializes basic member variables and lays out the ordering of method calls.
Args:
_input (str or RDKit mol object): the input molecule
input_format (str): input_format of the input molecule
noH (bool, optional): boolean which specifies whether hydrogens should be removed
exclude (str, optional): string listing atom indices to remove. 1-indexed.
spec_atom_1 (int, optional): specifies atom1
spec_atom_2 (list of int, optional): contains atom2(s)
manual_file_lines (bool, optional): to parse _input line by line manually using get_file_lines or not
"""
self._input, self.FORMAT = _input, input_format
self.ATOMTYPES, self.CARTESIANS = [], []
self.noH = noH
self.exclude = exclude
self.spec_atom_1, self.spec_atom_2 = spec_atom_1, spec_atom_2
if manual_file_lines:
self.file_lines = DataParser.get_file_lines(_input)
self.parse_input()
self.ATOMTYPES, self.CARTESIANS = np.array(self.ATOMTYPES), np.array(self.CARTESIANS)
if (self.noH or self.exclude) and self.FORMAT != 'cube':
self.exclude_atoms()
@abstractmethod
def parse_input(self):
"""Parse the input, filling ATOMTYPES with the atoms of the input molecule and CARTESIANS with the atoms xyz coordinates.
"""
pass
def exclude_atoms(self):
"""Remove requested atoms - hydrogens or manually specified atoms"""
atoms_to_remove = [False for i in range(len(self.ATOMTYPES))]
if self.noH:
atoms_to_remove = [
True if self.ATOMTYPES[i] == 'H'
else atoms_to_remove[i]
for i in range(len(atoms_to_remove))]
if self.exclude:
del_atom_list = [int(atom)-1 for atom in self.exclude.split(',')]
atoms_to_remove = [
True if i in del_atom_list
else atoms_to_remove[i]
for i in range(len(atoms_to_remove))]
spec_atoms = [self.spec_atom_1-1]
[spec_atoms.append(atom-1) for atom in self.spec_atom_2]
#if removed atom is one of the spec atoms, replace its atom type with Bq (radii=0)
self.ATOMTYPES = np.array([
'Bq' if i in spec_atoms and atoms_to_remove[i]
else self.ATOMTYPES[i]
for i in range(len(atoms_to_remove))])
atoms_to_remove = [
False if i in spec_atoms and atoms_to_remove[i]
else atoms_to_remove[i]
for i in range(len(atoms_to_remove))]
spec_atoms = [
spec_atom - np.count_nonzero(atoms_to_remove[:spec_atom])
for spec_atom in spec_atoms]
self.spec_atom_1 = spec_atoms[0] + 1
self.spec_atom_2 = [atom+1 for atom in spec_atoms[1:]]
self.ATOMTYPES = self.ATOMTYPES[np.invert(atoms_to_remove)]
self.CARTESIANS = self.CARTESIANS[np.invert(atoms_to_remove)]
@staticmethod
def get_file_lines(file):
""""Reads file and returns the lines using readlines()
Args:
file (str): the path to the file
Returns:
list with lines of the file
"""
with open(file, 'r') as f:
return f.readlines()
class CubeParser(DataParser):
"""Read data from cube file, obtian XYZ Cartesians, dimensions, and volumetric data."""
def __init__(self, file, input_format):
super().__init__(file, input_format, manual_file_lines=True)
self.INCREMENTS = np.asarray([self.x_inc, self.y_inc, self.z_inc])
self.DENSITY = np.asarray(self.DENSITY)
self.DATA = np.reshape(self.DENSITY, (self.xdim, self.ydim, self.zdim))
def parse_input(self):
"""Parses input from a cube file.
http://paulbourke.net/dataformats/cube/ was used to determine general format of a cube file.
"""
self.num_atoms = None
self.ATOMNUM, self.DENSITY, self.DENSITY_LINE = [], [], []
file_lines = self.file_lines
start_of_atoms = 6
# first two lines skipped as they do not have useful information for this program
for i in range(2, len(file_lines)):
try:
curr_line = file_lines[i]
coord = [float(c) for c in curr_line.split()]
if i == 2:
self.num_atoms = coord[0]
self.ORIGIN = [coord[1]*BOHR_TO_ANG, coord[2]*BOHR_TO_ANG, coord[3]*BOHR_TO_ANG]
elif i == 3:
self.xdim = int(coord[0])
self.SPACING = coord[1]*BOHR_TO_ANG
self.x_inc = [coord[1]*BOHR_TO_ANG, coord[2]*BOHR_TO_ANG, coord[3]*BOHR_TO_ANG]
elif i == 4:
self.ydim = int(coord[0])
self.y_inc = [coord[1]*BOHR_TO_ANG, coord[2]*BOHR_TO_ANG, coord[3]*BOHR_TO_ANG]
elif i == 5:
self.zdim = int(coord[0])
self.z_inc = [coord[1]*BOHR_TO_ANG, coord[2]*BOHR_TO_ANG,coord[3]*BOHR_TO_ANG]
elif self.num_atoms and start_of_atoms <= i < start_of_atoms + self.num_atoms:
self._parse_atom_line(coord)
else:
self._parse_density_line(coord, curr_line)
except ValueError as e:
# TODO: make a custom cube file exception to chain ValueError with this error
# TODO: handle potentially invalid atom errors
sys.exit(f' Unable to parse \"{self._input}\", a value on line {i + 1} could not be read in.')
def _parse_atom_line(self, split_line):
"""Parses a line in the cube file containing atom number and coordinates."""
atom_num = int(split_line[0])
atom = periodic_table[atom_num]
x, y, z = float(split_line[2]) * BOHR_TO_ANG, float(split_line[3]) * BOHR_TO_ANG, float(split_line[4]) * BOHR_TO_ANG
self.ATOMNUM.append(atom_num)
self.ATOMTYPES.append(atom)
self.CARTESIANS.append([x, y, z])
def _parse_density_line(self, split_line, curr_line):
"""Appends density values from a line in the cube file to the DENSITY member array."""
for val in split_line:
self.DENSITY.append(float(val))
self.DENSITY_LINE.append(curr_line)
class XYZParser(DataParser):
"""Read XYZ Cartesians from an xyz file or chem files similar to xyz."""
def __init__(self, file, input_format, noH, exclude, spec_atom_1, spec_atom_2):
super().__init__(file, input_format, noH, exclude, spec_atom_1, spec_atom_2, manual_file_lines=True)
def parse_input(self):
"""Parses input from either xyz file or com/gif file."""
file_lines = self.file_lines
if self.FORMAT == 'xyz':
for i in range(0,len(file_lines)):
try:
coord = file_lines[i].split()
for i in range(len(coord)):
try:
coord[i] = float(coord[i])
except ValueError: pass
if len(coord) == 4:
if isinstance(coord[1], float) and isinstance(coord[2], float) and isinstance(coord[3], float):
[atom, x,y,z] = [coord[0], coord[1], coord[2], coord[3]]
self.ATOMTYPES.append(atom)
self.CARTESIANS.append([x,y,z])
except: pass
elif self.FORMAT == 'com' or self.FORMAT == 'gjf':
for i in range(0,len(file_lines)):
if file_lines[i].find("#") > -1:
if len(file_lines[i+1].split()) == 0:
start = i+5
if len(file_lines[i+2].split()) == 0:
start = i+6
break
for i in range(start, len(file_lines)):
try:
coord = file_lines[i].split()
for i in range(len(coord)):
try:
coord[i] = float(coord[i])
except ValueError: pass
if len(coord) == 4:
if isinstance(coord[1], float) and isinstance(coord[2], float) and isinstance(coord[3],float):
[atom, x,y,z] = [coord[0], coord[1], coord[2], coord[3]]
self.ATOMTYPES.append(atom)
self.CARTESIANS.append([x,y,z])
except: pass
class cclibParser(DataParser):
"""Use the cclib package to extract data from generic computational chemistry output files."""
def __init__(self, file, input_format, noH, exclude, spec_atom_1, spec_atom_2):
super().__init__(file, input_format, noH, exclude, spec_atom_1, spec_atom_2)
def parse_input(self):
"""Parses input file uses cclib file parser."""
cclib_parsed = cclib.io.ccread(self._input)
self.CARTESIANS = np.array(cclib_parsed.atomcoords[-1])
for i in cclib_parsed.atomnos:
self.ATOMTYPES.append(periodic_table[i])
class RDKitParser(DataParser):
"""Extract coordinates and atom types from rdkit mol object
Attributes:
ATOMTYPES (numpy array): List of elements present in molecular file
CARTESIANS (numpy array): List of Cartesian (x,y,z) coordinates for each atom
"""
def __init__(self, mol, noH, exclude, spec_atom_1, spec_atom_2):
super().__init__(mol, 'RDKit', noH, exclude, spec_atom_1, spec_atom_2)
def parse_input(self):
"""Store cartesians and symbols from mol object"""
try:
self.ATOMTYPES, self.CARTESIANS = [], []
for i in range(self._input.GetNumAtoms()):
self.ATOMTYPES.append(self._input.GetAtoms()[i].GetSymbol())
pos = self._input.GetConformer().GetAtomPosition(i)
self.CARTESIANS.append([pos.x, pos.y, pos.z])
except ValueError:
self.ATOMTYPES, self.CARTESIANS = [], []
print("Mol object does not have 3D coordinates!")