Skip to content

Commit 9dd5750

Browse files
authored
Symbolic regression classification generator
1 parent e0fe378 commit 9dd5750

File tree

1 file changed

+176
-0
lines changed

1 file changed

+176
-0
lines changed
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
2+
# coding: utf-8
3+
4+
# Evaluate a polynomial string
5+
6+
def symbolize(s):
    """
    Convert an equation string into a SymPy expression.

    The input notation uses '.' for multiplication and '^' for
    exponentiation; both are translated to Python operators
    ('*' and '**') before parsing with ``sympify``.
    """
    from sympy import sympify

    # Translate the custom operator notation into Python syntax,
    # then parse it into a SymPy expression object.
    expr = sympify(s.replace('.', '*').replace('^', '**'))
    return expr
16+
17+
18+
def eval_multinomial(s, vals=None, symbolic_eval=False):
    """
    Evaluate a polynomial string at the given values.

    Parameters
    ----------
    s : str
        Polynomial expression, e.g. ``'2*x1+3*x2^2'`` (notation accepted
        by ``symbolize``).
    vals : list, tuple, or dict, optional
        Values substituted for the variables.  Lists/tuples are paired
        with the variables in sorted name order; dict values are used in
        sorted key order.  Values may themselves be SymPy symbols when
        ``symbolic_eval`` is True, provided they were declared beforehand.
    symbolic_eval : bool
        When False, the number of supplied values must equal the number
        of variables in the expression.

    Returns
    -------
    SymPy expression (a plain number for fully numeric input), or
    ``None`` when the input is invalid.
    """
    from sympy import Symbol

    sym_s = symbolize(s)
    # Variable names in deterministic (sorted) order so positional
    # values pair up predictably with x1, x2, ...
    sym_lst = sorted(str(sym) for sym in sym_s.atoms(Symbol))

    # Guard the count check: len(None) would raise TypeError in the
    # original, so treat missing vals as length 0.
    n_vals = 0 if vals is None else len(vals)
    if not symbolic_eval and len(sym_lst) != n_vals:
        print("Length of the input values did not match number of variables and symbolic evaluation is not selected")
        return None

    if isinstance(vals, dict):
        # Pair sorted variable names with values taken in sorted-key order.
        sub = list(zip(sym_lst, (vals[k] for k in sorted(vals))))
    elif isinstance(vals, (list, tuple)):
        sub = list(zip(sym_lst, vals))
    else:
        # Unsupported container type: fail explicitly instead of raising
        # NameError on the unbound 'sub' (bug in the original).
        print("vals must be a list, tuple, or dict")
        return None

    return sym_s.subs(sub)
49+
50+
51+
# ### Helper function for flipping binary values of a _ndarray_
52+
53+
def flip(y, p):
    """
    Randomly flip entries of a binary array.

    Parameters
    ----------
    y : array-like of 0/1 values
        Binary labels to perturb.
    p : float
        Probability of flipping each individual entry.

    Returns
    -------
    numpy.ndarray of int
        Copy of ``y`` with each element independently flipped
        (XOR-ed with 1) with probability ``p``.
    """
    import numpy as np

    # Draw the whole flip mask in one vectorized call instead of the
    # original per-element Python loop (n separate RNG calls).
    mask = np.random.choice([1, 0], size=len(y), p=[p, 1 - p])
    return np.array(np.logical_xor(y, mask), dtype=int)
61+
62+
63+
# ### Classification sample generation based on a symbolic expression
64+
65+
def gen_classification_symbolic(m=None, n_samples=100, n_features=2, flip_y=0.0):
    """
    Generates classification sample based on a symbolic expression.

    Calculates the output of the symbolic expression at randomly generated
    (Gaussian distribution) points and assigns binary classification based
    on sign.

    m: The symbolic expression. Needs x1, x2, etc as variables and regular
       python arithmetic symbols to be used.
    n_samples: Number of samples to be generated.
    n_features: Number of variables. This is automatically inferred from
       the symbolic expression, so this is ignored in case a symbolic
       expression is supplied. However if no symbolic expression is
       supplied then a default simple polynomial is used to generate
       classification samples with n_features.
    flip_y: Probability of flipping the classification labels randomly.
       A higher value introduces more noise and makes the classification
       problem harder.

    Returns a numpy ndarray with dimension (n_samples, n_features+1).
    Last column is the response vector.
    """
    import numpy as np
    from sympy import Symbol, sympify

    if m is None:
        # Build a default expression: x1 +/- x2 +/- ... with random signs.
        terms = []
        for i in range(1, n_features + 1):
            terms.append('x' + str(i) + np.random.choice(['+', '-'], p=[0.5, 0.5]))
        m = ''.join(terms)[:-1]  # drop the dangling trailing sign

    sym_m = sympify(m)
    # Feature count is always re-inferred from the expression itself.
    n_features = len(sym_m.atoms(Symbol))

    # Sample feature values from a zero-mean Gaussian with scale 5;
    # result is (n_samples, n_features).
    lst_features = np.array([np.random.normal(scale=5, size=n_samples)
                             for _ in range(n_features)]).T
    lst_features = lst_features.reshape(n_samples, n_features)

    # Evaluate the expression at every sampled point.
    evals = np.array([eval_multinomial(m, vals=list(row)) for row in lst_features])

    # Binarize on sign, then optionally inject label noise.
    evals_binary = np.array(evals > 0, dtype=int).flatten()
    evals_binary = flip(evals_binary, p=flip_y)
    evals_binary = evals_binary.reshape(n_samples, 1)

    return np.hstack((lst_features, evals_binary))
112+
113+
# ### Regression sample generation based on a symbolic expression
114+
115+
116+
def gen_regression_symbolic(m=None, n_samples=100, n_features=2, noise=0.0, noise_dist='normal'):
    """
    Generates regression sample based on a symbolic expression.

    Calculates the output of the symbolic expression at randomly generated
    (drawn from a Gaussian distribution) points.

    m: The symbolic expression. Needs x1, x2, etc as variables and regular
       python arithmetic symbols to be used.
    n_samples: Number of samples to be generated.
    n_features: Number of variables. This is automatically inferred from
       the symbolic expression, so this is ignored in case a symbolic
       expression is supplied. However if no symbolic expression is
       supplied then a default simple polynomial is used to generate
       regression samples with n_features.
    noise: Magnitude of noise to be introduced (added to the output).
    noise_dist: Type of the probability distribution of the noise signal.
       Currently supports (case-insensitive): 'normal', 'uniform', 'beta',
       'gamma', 'poisson', 't', 'laplace'.  An unrecognized value adds no
       noise.

    Returns a numpy ndarray with dimension (n_samples, n_features+1).
    Last column is the response vector.
    """
    import numpy as np
    from sympy import Symbol, sympify

    if m is None:
        # Build a default expression: x1 +/- x2 +/- ... with random signs.
        terms = []
        for i in range(1, n_features + 1):
            terms.append('x' + str(i) + np.random.choice(['+', '-'], p=[0.5, 0.5]))
        m = ''.join(terms)[:-1]  # drop the dangling trailing sign

    sym_m = sympify(m)
    # Feature count is always re-inferred from the expression itself.
    n_features = len(sym_m.atoms(Symbol))

    # Sample feature values from a zero-mean Gaussian with scale 5;
    # result is (n_samples, n_features).
    lst_features = np.array([np.random.normal(scale=5, size=n_samples)
                             for _ in range(n_features)]).T
    lst_features = lst_features.reshape(n_samples, n_features)

    # Evaluate the expression at every sampled point.
    evals = np.array([eval_multinomial(m, vals=list(row)) for row in lst_features])
    evals = evals.reshape(n_samples, 1)

    # Normalize the distribution name so the legacy 'Gamma' spelling
    # (the only capitalized case in the original) still works.
    dist = str(noise_dist).lower()
    if dist == 'normal':
        noise_sample = noise * np.random.normal(loc=0, scale=1.0, size=n_samples)
    elif dist == 'uniform':
        noise_sample = noise * np.random.uniform(low=0, high=1.0, size=n_samples)
    elif dist == 'beta':
        noise_sample = noise * np.random.beta(a=0.5, b=1.0, size=n_samples)
    elif dist == 'gamma':
        noise_sample = noise * np.random.gamma(shape=1.0, scale=1.0, size=n_samples)
    elif dist == 'poisson':
        # Promised by the original docstring but missing a branch there.
        noise_sample = noise * np.random.poisson(lam=1.0, size=n_samples)
    elif dist == 't':
        # Promised by the original docstring but missing a branch there.
        noise_sample = noise * np.random.standard_t(df=5, size=n_samples)
    elif dist == 'laplace':
        noise_sample = noise * np.random.laplace(loc=0.0, scale=1.0, size=n_samples)
    else:
        # Unknown distribution: add zero noise instead of crashing with
        # NameError on the unbound 'noise_sample' (bug in the original).
        noise_sample = np.zeros(n_samples)

    evals = evals + noise_sample.reshape(n_samples, 1)

    return np.hstack((lst_features, evals))

0 commit comments

Comments
 (0)