-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
73 additions
and
53 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,74 +1,94 @@ | ||
import deepchem as dc | ||
""" | ||
This module generates noisy data. | ||
Data is either generated analytically or from the QM9 dataset. | ||
""" | ||
# standard imports | ||
import numpy as np | ||
import random | ||
import pandas as pd | ||
from pandas import Series, DataFrame | ||
|
||
# deepchem data imports | ||
from deepchem.molnet import load_qm9 | ||
|
||
def generate_analytical_data(to_csv=True, n_samples=10000):
    """
    Generate noisy analytical data.

    Two features, x1 and x2, are drawn uniformly from [0, 10). An
    analytical target value is created with ``f`` (2 * x1 + x2) and
    zero-mean Gaussian noise from ``g`` is added to it. The standard
    deviation of the noise is (x1 + x2) / 100 for each sample.

    Parameters
    ----------
    to_csv : bool, optional
        When True (default), write the data to "analytical_data.csv" in
        the current working directory. Read it back with
        ``pd.read_csv("analytical_data.csv")``.
    n_samples : int, optional
        Number of samples to generate (default 10000).

    Returns
    -------
    gen_data : pandas.DataFrame
        Columns 'X1', 'X2' (features) and 'Y' (noisy target).
    noise : numpy.ndarray
        The noise that was added to each target value.
    """
    x1 = np.random.uniform(0, 10, n_samples)
    x2 = np.random.uniform(0, 10, n_samples)
    mu = 0

    def f(x1, x2):
        # noise-free analytical target
        return 2 * x1 + x2

    def g(x1, x2):
        # Sigma is derived from the arguments and the output is sized to
        # match them; a hard-coded sample count that differs from the
        # feature length makes np.random.normal fail to broadcast.
        sigma = (x1 + x2) / 100
        return np.random.normal(mu, np.abs(sigma), np.shape(x1))

    noise = g(x1, x2)
    y = f(x1, x2) + noise

    gen_data = pd.DataFrame({'X1': x1, 'X2': x2, 'Y': y})

    if to_csv:
        gen_data.to_csv("analytical_data.csv", index=False, sep=',')

    return gen_data, noise
def generate_qm9_noise_data(x1, x2, y, to_csv=True):
    """
    Generate noisy QM9 data.

    Loads QM9 through ``load_qm9`` and, from the test split's target
    matrix, uses two task columns as features and a third as the target.
    Zero-mean Gaussian noise is added to every target value, with a
    per-sample standard deviation of ``|feature1 + feature2| / 2``.

    Parameters
    ----------
    x1, x2 : int
        1-based indices of the QM9 task columns to use as features
        (documented upstream as lying in [1, 12]).
    y : int
        1-based index of the QM9 task column to use as the target
        (same range).
    to_csv : bool, optional
        When True (default), write the data to "qm9_noise_data.csv" in
        the current working directory. Read it back with
        ``pd.read_csv('qm9_noise_data.csv')``.

    Returns
    -------
    gen_data : pandas.DataFrame
        Columns ``x1_<task>``, ``x2_<task>`` and ``y_<task>``, where
        ``<task>`` is the matching entry of the task list returned by
        ``load_qm9``. The ``y_`` column holds the noisy target.
    Noise : list of float
        The noise added to each target value, in row order.
    """
    # load data
    qm9_tasks, datasets, _transformers = load_qm9()
    _train_dataset, _valid_dataset, test_dataset = datasets

    c1 = qm9_tasks[x1 - 1]
    c2 = qm9_tasks[x2 - 1]
    c3 = qm9_tasks[y - 1]

    # extract the 'y' values; transposing lets a task be picked by row
    # NOTE(review): assumes test_dataset.y is (n_samples, n_tasks) - confirm
    task_rows = test_dataset.y.T

    X1 = task_rows[x1 - 1]
    X2 = task_rows[x2 - 1]
    Y_a = task_rows[y - 1]

    # One vectorised draw with a per-sample sigma replaces drawing a full
    # length-n sample on every loop iteration and keeping one element.
    mu = 0
    sigma = np.abs((X1 + X2) / 2)
    noise = np.random.normal(mu, sigma)
    y_noisy = Y_a + noise

    gen_data = pd.DataFrame({f'x1_{c1}': X1, f'x2_{c2}': X2, f'y_{c3}': y_noisy})

    if to_csv:
        gen_data.to_csv("qm9_noise_data.csv", index=False, sep=',')

    # returned as a plain list, matching the original return type
    Noise = noise.tolist()
    return gen_data, Noise