louiskhub
/
Deep-clustering-of-small-molecules-at-large-scale-via-variational-autoencoder-embedding-and-K-means
Public
forked from HamidHadipour/Deep-clustering-of-small-molecules-at-large-scale-via-variational-autoencoder-embedding-and-K-means
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathglobal_feature_generation.py
60 lines (47 loc) · 1.96 KB
/
global_feature_generation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import numpy as np
import pandas as pd
from rdkit import Chem
from mordred import Calculator, descriptors
from typing import List, Dict, Any
import concurrent.futures
def get_data():
    """Load descriptor names and active-compound SMILES into module globals.

    Sets two module-level names and returns nothing:

    * ``SELECTED_DESCRIPTOR_NAMES`` -- list with one entry per line of
      ``data/mordred_descriptors.txt``.
    * ``SMILES_OF_ACTIVE_COMPOUNDS`` -- the values of the
      ``decoded_infile_property`` column of
      ``data/zinc_15_m1002a_active.csv``.

    Both paths are relative to the current working directory.
    """
    global SELECTED_DESCRIPTOR_NAMES
    global SMILES_OF_ACTIVE_COMPOUNDS

    # Names of the selected Mordred descriptors, one per line.
    with open("data/mordred_descriptors.txt", "r") as names_file:
        SELECTED_DESCRIPTOR_NAMES = names_file.read().splitlines()

    # Compounds that were found active against HC by dl_mlp_class_v1_4.
    # The column names got lost in the CSV and were restored from
    # dl_mlp_class_v1_4.py l.474; supplying them explicitly means the
    # first row of the file is read as data.
    column_names = [
        "id",
        "infile_smiles",
        "infile_property",
        "decoded_infile_property",
        "predicted_infile_property",
    ]
    actives: pd.DataFrame = pd.read_csv(
        "data/zinc_15_m1002a_active.csv",
        names=column_names,
    )

    # NOTE(review): the SMILES are taken from "decoded_infile_property",
    # not "infile_smiles" -- confirm this matches the real column layout.
    SMILES_OF_ACTIVE_COMPOUNDS = actives["decoded_infile_property"].values
def _ensure_worker_state():
    """Create the module globals the descriptor worker depends on, if missing.

    ``calc`` and ``SELECTED_DESCRIPTOR_NAMES`` are normally created by the
    script's ``__main__`` section. Worker processes that do not inherit the
    parent's memory (the ``spawn`` / ``forkserver`` start methods) re-import
    this module without running that section, so the names would be
    undefined there. When they already exist this is a cheap no-op.
    """
    global calc
    module_globals = globals()
    if "SELECTED_DESCRIPTOR_NAMES" not in module_globals:
        # get_data() assigns SELECTED_DESCRIPTOR_NAMES as a module global.
        get_data()
    if "calc" not in module_globals:
        calc = Calculator(descriptors, ignore_3D=True)


def insert_compound_descriptors(enum) -> np.ndarray:
    """Compute the selected Mordred descriptors for one compound.

    Args:
        enum: An ``(index, smiles)`` pair as produced by ``enumerate``.
            The index is only used for progress output and error messages.

    Returns:
        A 1-D ``float32`` array holding the molecule's descriptor values in
        the order given by ``SELECTED_DESCRIPTOR_NAMES``.

    Raises:
        ValueError: If RDKit cannot parse the SMILES string
            (``Chem.MolFromSmiles`` returned ``None``).
    """
    _ensure_worker_state()

    i, smiles = enum
    if i % 500 == 0:
        # Coarse progress indicator.
        print(i)

    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        # Fail with a message that names the offending input instead of
        # passing None on to the descriptor calculator.
        raise ValueError(f"Could not parse SMILES at position {i}: {smiles!r}")

    # The calculator result is indexed by descriptor name below.
    all_molecule_descriptors = calc(molecule)
    selected_molecule_descriptors: np.ndarray = np.array(
        [all_molecule_descriptors[name] for name in SELECTED_DESCRIPTOR_NAMES],
        dtype=np.float32,
    )
    return selected_molecule_descriptors
if __name__ == "__main__":
    # Populate SELECTED_DESCRIPTOR_NAMES and SMILES_OF_ACTIVE_COMPOUNDS.
    get_data()

    # Mordred calculator with 3D descriptors ignored; it is read as a
    # module global by insert_compound_descriptors.
    calc = Calculator(descriptors, ignore_3D=True)

    # Fan the (index, smiles) pairs out over a pool of worker processes.
    # executor.map yields the results in input order.
    indexed_smiles = enumerate(SMILES_OF_ACTIVE_COMPOUNDS)
    with concurrent.futures.ProcessPoolExecutor() as pool:
        row_iterator = pool.map(insert_compound_descriptors, indexed_smiles)

    # One row per molecule, one column per selected descriptor, indexed by
    # the SMILES strings.
    descriptor_table = pd.DataFrame(
        data=list(row_iterator),
        index=SMILES_OF_ACTIVE_COMPOUNDS,
        columns=SELECTED_DESCRIPTOR_NAMES,
    )

    # reset_index() moves the SMILES index into an ordinary column before
    # the table is written out.
    table_for_saving = descriptor_table.reset_index()
    table_for_saving.to_feather("data/molecule_descriptors.feather")

    print("\nDone!")