From 1f9a7649b6448fc766bbc22bcc66b7485a2c2326 Mon Sep 17 00:00:00 2001 From: Samantha Tetef Date: Mon, 15 Mar 2021 11:08:01 -0700 Subject: [PATCH 1/2] PEP8 compliance --- gandy/quality_est/datagen.py | 132 +++++++++++++++++++++-------------- 1 file changed, 79 insertions(+), 53 deletions(-) diff --git a/gandy/quality_est/datagen.py b/gandy/quality_est/datagen.py index b0dac73..f832502 100644 --- a/gandy/quality_est/datagen.py +++ b/gandy/quality_est/datagen.py @@ -1,74 +1,100 @@ -import deepchem as dc +""" +This module generates noisy data. + +Data is either generated analytically or from the QM9 dataset. +""" +# standard imports import numpy as np -import random import pandas as pd -from pandas import Series, DataFrame + +# deepchem data imports from deepchem.molnet import load_qm9 -def generate_analytical_data(to_csv = True): - x1 = np.random.uniform(0, 1000, 1000) - x2 = np.random.uniform(0, 1000, 1000) - mu = 0 - sigma = (x1+x2)/2 +def generate_analytical_data(to_csv=True): + """ + Generate noisy analytical data. + + This function generates random x1 and x2 as data features, + creates an analytical target value using the f function, + and then adds noise to that value using g. 
+ """ + x1 = np.random.uniform(0, 1000, 1000) + x2 = np.random.uniform(0, 1000, 1000) + mu = 0 + sigma = (x1 + x2) / 2 + + def f(x1, x2): + f_data = np.sin(x1) + np.cos(x2) + return f_data + + def g(x1, x2): + g_data = np.random.normal(mu, np.abs(sigma), 1000) + return g_data + + noise = g(x1, x2) + y = f(x1, x2) + noise - def f(x1, x2): - f_data = np.sin(x1)+np.cos(x2) - return f_data + gen_data = pd.DataFrame({'X1': x1, 'X2': x2, 'Y': y}) - def g(x1, x2): - g_data = np.random.normal(mu, np.abs(sigma), 1000) - return g_data + if to_csv: + gen_data.to_csv("analytical_data.csv", index=False, sep=',') + # read in using gen_data = pd.read_csv("analytical_data.csv") - y = f(x1, x2) + g(x1, x2) + return gen_data, noise - gen_data = pd.DataFrame({'X1':x1, 'X2':x2, 'Y':y}) - - if to_csv: - gen_data.to_csv("analytical_data.csv", index=False, sep = ',') - return gen_data +def generate_qm9_noise_data(x1, x2, y, to_csv=True): + """ + Generate noisy QM9 data. + This function takes in x1 and x2, which correspond to + the data columns in QM9 to use as data (in [1,12]). -def generate_qm9_noise_data(x1,x2,y): + The y is the column to use as the target, also in [1,12]. 
+ """ + # load data + qm9_tasks, datasets, transformers = load_qm9() + train_dataset, valid_dataset, test_dataset = datasets - #load data - qm9_tasks, datasets, transformers = load_qm9() - train_dataset, valid_dataset, test_dataset = datasets + c1 = qm9_tasks[x1 - 1] + c2 = qm9_tasks[x2 - 1] + c3 = qm9_tasks[y - 1] - c1=qm9_tasks[x1-1] - c2=qm9_tasks[x2-1] - c3=qm9_tasks[y-1] - - #extrct the 'y'values - Y = test_dataset.y - YT = Y.T + # extract the 'y' values + Y = test_dataset.y + YT = Y.T - X1 = YT[x1-1] - X2 = YT[x2-1] - Y_a = YT[y-1] + X1 = YT[x1 - 1] + X2 = YT[x2 - 1] + Y_a = YT[y - 1] - x1 = X1.tolist() - x2 = X2.tolist() - y_l = Y_a.tolist() - l = Y_a.shape + x1 = X1.tolist() + x2 = X2.tolist() + y_l = Y_a.tolist() + length = Y_a.shape - n = np.random.uniform(0, l, 1).astype(np.int) #set the number of noise added - ni = np.random.uniform(0, l, n) #n random values - an = len(ni) + # set the number of noise added + n = np.random.uniform(0, length, 1).astype(np.int) + # n random values + ni = np.random.uniform(0, length, n) + an = len(ni) + # add noise to n numbers of y + Noise = [] + for i in range(an): + mu = 0 + sigma = (x1[i] + x2[i]) / 2 + noise = np.random.normal(mu, np.abs(sigma), n) + g = noise.tolist() + Noise.append(g[i]) + y_l[i] += g[i] - #add noise to n numbers of y - for i in range(an): - mu = 0 - sigma = (x1[i]+x2[i])/2 - noise = np.random.normal(mu, np.abs(sigma), n) - g = noise.tolist() - y_l[i] += g[i] + # save to_csv: + gen_data = pd.DataFrame({f'x1_{c1}': X1, f'x2_{c2}': X2, f'y_{c3}': y_l}) - #save to_csv: - dataframe = pd.DataFrame({'x1_'+c1:X1,'x2_'+c2:X2,'y_'+c3:y_l}) - dataframe.to_csv("qm9_noise_data.csv", index=False, sep = ',') - gen_data = pd.read_csv('qm9_noise_data.csv') + if to_csv: + gen_data.to_csv("qm9_noise_data.csv", index=False, sep=',') + # read in using gen_data = pd.read_csv('qm9_noise_data.csv') - return gen_data + return gen_data, Noise From 59da2cb437f97337b62ebe28ec8cb9f34959cc24 Mon Sep 17 00:00:00 2001 From: Yuchi 
Fang Date: Tue, 16 Mar 2021 03:53:49 +0800 Subject: [PATCH 2/2] modified noise part --- gandy/quality_est/datagen.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/gandy/quality_est/datagen.py b/gandy/quality_est/datagen.py index f832502..887e83f 100644 --- a/gandy/quality_est/datagen.py +++ b/gandy/quality_est/datagen.py @@ -72,20 +72,15 @@ def generate_qm9_noise_data(x1, x2, y, to_csv=True): x1 = X1.tolist() x2 = X2.tolist() y_l = Y_a.tolist() - length = Y_a.shape + length = len(Y_a) - # set the number of noise added - n = np.random.uniform(0, length, 1).astype(np.int) - # n random values - ni = np.random.uniform(0, length, n) - an = len(ni) # add noise to n numbers of y Noise = [] - for i in range(an): + for i in range(length): mu = 0 sigma = (x1[i] + x2[i]) / 2 - noise = np.random.normal(mu, np.abs(sigma), n) + noise = np.random.normal(mu, np.abs(sigma), length) g = noise.tolist() Noise.append(g[i]) y_l[i] += g[i]