|
| 1 | + |
| 2 | +# coding: utf-8 |
| 3 | + |
| 4 | +# Evaluate a polynomial string |
| 5 | + |
def symbolize(s):
    """
    Convert a string equation into a SymPy expression.

    The input convention uses '.' for multiplication (e.g. "x1.x2") and
    '^' for exponentiation (e.g. "x1^2"); both are rewritten to the
    Python operators '*' and '**' before parsing with sympify.

    NOTE(review): every '.' becomes '*', so decimal literals such as
    "2.5*x1" would be mangled — coefficients must be integers. Confirm
    with callers before changing this convention.
    """
    from sympy import sympify

    expr = s.replace('.', '*').replace('^', '**')
    return sympify(expr)
| 16 | + |
| 17 | + |
def eval_multinomial(s, vals=None, symbolic_eval=False):
    """
    Evaluate the polynomial string `s` at `vals`.

    Parameters
    ----------
    s : str
        Polynomial string understood by `symbolize` ('.' for multiply,
        '^' for power), with variables x1, x2, ...
    vals : list, tuple or dict, optional
        Values substituted for the variables, matched against the
        variable names in sorted order (for a dict, its sorted keys).
        Entries may themselves be SymPy symbols for partial evaluation.
    symbolic_eval : bool
        When True, skip the length check between variables and `vals`,
        allowing partial/symbolic substitution.

    Returns
    -------
    SymPy expression (a number when fully evaluated), or None when the
    length check fails.
    """
    from sympy import Symbol

    sym_expr = symbolize(s)
    # Variable names in deterministic (sorted) order for substitution.
    sym_names = sorted(str(v) for v in sym_expr.atoms(Symbol))

    # Without symbolic evaluation the caller must supply exactly one value
    # per variable. vals=None now fails this check cleanly instead of
    # raising TypeError on len(None) as the original did.
    if not symbolic_eval and (vals is None or len(sym_names) != len(vals)):
        print("Length of the input values did not match number of variables "
              "and symbolic evaluation is not selected")
        return None

    if isinstance(vals, dict):
        # Dict values are paired with variables via the sorted key order.
        values = [vals[k] for k in sorted(vals)]
    elif isinstance(vals, (list, tuple)):
        values = list(vals)
    else:
        # Nothing to substitute (e.g. vals=None with symbolic_eval=True);
        # the original hit a NameError on `sub` in this case.
        values = []

    return sym_expr.subs(list(zip(sym_names, values)))
| 49 | + |
| 50 | + |
| 51 | +# ### Helper function for flipping binary values of a _ndarray_ |
| 52 | + |
def flip(y, p):
    """
    Randomly flip binary labels.

    Each element of `y` is flipped (0 <-> 1) independently with
    probability `p`.

    Parameters
    ----------
    y : array-like of 0/1 ints
        Binary label vector.
    p : float in [0, 1]
        Per-element flip probability.

    Returns
    -------
    numpy ndarray of ints, same length as `y`.
    """
    import numpy as np

    # Draw the whole flip mask in one vectorized call instead of the
    # original element-by-element Python loop.
    mask = np.random.choice([1, 0], size=len(y), p=[p, 1 - p])
    return np.array(np.logical_xor(y, mask), dtype=int)
| 61 | + |
| 62 | + |
| 63 | +# ### Classification sample generation based on a symbolic expression |
| 64 | + |
def gen_classification_symbolic(m=None, n_samples=100, n_features=2, flip_y=0.0):
    """
    Generate a binary-classification sample from a symbolic expression.

    The expression is evaluated at points drawn from a Gaussian
    distribution (scale=5) and the class label is the sign of the result
    (1 if positive, else 0), optionally corrupted by random label flips.

    Parameters
    ----------
    m : str, optional
        Symbolic expression using variables x1, x2, ... with '.' for
        multiplication and '^' for power. If None, a default linear
        expression in `n_features` variables with random +/- signs is
        generated.
    n_samples : int
        Number of samples to generate.
    n_features : int
        Number of variables. Ignored when `m` is supplied — the count is
        inferred from the expression instead.
    flip_y : float
        Probability of flipping each label (label noise); higher values
        make the problem harder.

    Returns
    -------
    numpy ndarray of shape (n_samples, n_features + 1); the last column
    is the binary response.
    """
    import numpy as np
    from sympy import Symbol, sympify

    if m is None:  # fix: identity check instead of `m == None`
        # Default expression: x1 +/- x2 +/- ... +/- xn
        m = ''
        for i in range(1, n_features + 1):
            m += 'x' + str(i) + np.random.choice(['+', '-'], p=[0.5, 0.5])
        m = m[:-1]  # drop the trailing operator

    sym_m = sympify(m)
    # The true number of variables comes from the expression itself.
    n_features = len(sym_m.atoms(Symbol))

    # Feature matrix: one Gaussian column per variable.
    lst_features = np.array([np.random.normal(scale=5, size=n_samples)
                             for _ in range(n_features)]).T
    lst_features = lst_features.reshape(n_samples, n_features)

    # Evaluate the expression row by row.
    evals = np.array([eval_multinomial(m, vals=list(row))
                      for row in lst_features])

    # Sign -> binary label, then inject label noise.
    labels = np.array(evals > 0, dtype=int).flatten()
    labels = flip(labels, p=flip_y).reshape(n_samples, 1)

    return np.hstack((lst_features, labels))
| 112 | + |
| 113 | +# ### Regression sample generation based on a symbolic expression |
| 114 | + |
| 115 | + |
def gen_regression_symbolic(m=None, n_samples=100, n_features=2, noise=0.0, noise_dist='normal'):
    """
    Generate a regression sample from a symbolic expression.

    The expression is evaluated at points drawn from a Gaussian
    distribution (scale=5) and additive noise scaled by `noise` is
    applied to the response.

    Parameters
    ----------
    m : str, optional
        Symbolic expression using variables x1, x2, ... with '.' for
        multiplication and '^' for power. If None, a default linear
        expression in `n_features` variables with random +/- signs is
        generated.
    n_samples : int
        Number of samples to generate.
    n_features : int
        Number of variables. Ignored when `m` is supplied — the count is
        inferred from the expression instead.
    noise : float
        Magnitude of the noise added to the response.
    noise_dist : str
        Noise distribution, case-insensitive: 'normal', 'uniform',
        'beta', 'gamma', 't', 'poisson', 'laplace'.

    Returns
    -------
    numpy ndarray of shape (n_samples, n_features + 1); the last column
    is the response.

    Raises
    ------
    ValueError
        If `noise_dist` is not a supported name. (The original code hit
        a NameError on `noise_sample` in that case, and its docstring
        promised t and Poisson noise that had no implementation.)
    """
    import numpy as np
    from sympy import Symbol, sympify

    if m is None:  # fix: identity check instead of `m == None`
        # Default expression: x1 +/- x2 +/- ... +/- xn
        m = ''
        for i in range(1, n_features + 1):
            m += 'x' + str(i) + np.random.choice(['+', '-'], p=[0.5, 0.5])
        m = m[:-1]  # drop the trailing operator

    sym_m = sympify(m)
    # The true number of variables comes from the expression itself.
    n_features = len(sym_m.atoms(Symbol))

    # Feature matrix: one Gaussian column per variable.
    lst_features = np.array([np.random.normal(scale=5, size=n_samples)
                             for _ in range(n_features)]).T
    lst_features = lst_features.reshape(n_samples, n_features)

    # Evaluate the expression row by row.
    evals = np.array([eval_multinomial(m, vals=list(row))
                      for row in lst_features])
    evals = evals.reshape(n_samples, 1)

    # Case-insensitive dispatch; the original accepted 'Gamma' with a
    # capital G while every other name was lowercase — both still work.
    dist = noise_dist.lower()
    if dist == 'normal':
        noise_sample = np.random.normal(loc=0, scale=1.0, size=n_samples)
    elif dist == 'uniform':
        noise_sample = np.random.uniform(low=0, high=1.0, size=n_samples)
    elif dist == 'beta':
        noise_sample = np.random.beta(a=0.5, b=1.0, size=n_samples)
    elif dist == 'gamma':
        noise_sample = np.random.gamma(shape=1.0, scale=1.0, size=n_samples)
    elif dist == 't':
        # df=5 is an arbitrary but reasonable heavy-tailed default.
        noise_sample = np.random.standard_t(df=5, size=n_samples)
    elif dist == 'poisson':
        noise_sample = np.random.poisson(lam=1.0, size=n_samples)
    elif dist == 'laplace':
        noise_sample = np.random.laplace(loc=0.0, scale=1.0, size=n_samples)
    else:
        raise ValueError("Unsupported noise_dist: " + str(noise_dist))

    evals = evals + noise * noise_sample.reshape(n_samples, 1)

    return np.hstack((lst_features, evals))
0 commit comments