Add MACHO data

bthtsang · Nov 12, 2017 · dcc0ece · dcc0ece
1 parent 336c9cf
commit dcc0ece
Show file tree

Hide file tree

Showing 7 changed files with 452 additions and 3,930 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,5 @@
 # Neural network autoencoders for unevenly sampled time series
-Code accompanying "An unsupervised neural network that outperforms on
-classification of unevenly sampled time series".
+Code accompanying "A recurrent neural network for classification of unevenly sampled variable stars".
 
 - Code for scores/figures is found in `figures.ipynb`
 - Autoencoder network architecture is defined in `autoencoder.py`

diff --git a/data/macho/full.pkl b/data/macho/full.pkl
diff --git a/figures.ipynb b/figures.ipynb
diff --git a/keras_logs/macho_fold/n200_ss0.7/gru_096_x2_5m04_drop25_emb64_bidir/weights.h5 b/keras_logs/macho_fold/n200_ss0.7/gru_096_x2_5m04_drop25_emb64_bidir/weights.h5
diff --git a/light_curve.py b/light_curve.py
@@ -58,10 +58,13 @@ def fit_lomb_scargle(self):
     def fit_supersmoother(self, periodic=True, scale=True):
         from supersmoother import SuperSmoother
         model = SuperSmoother(period=self.p if periodic else None)
-        model.fit(self.times, self.measurements, self.errors)
-        self.ss_resid = np.sqrt(np.mean((model.predict(self.times) - self.measurements) ** 2))
-        if scale:
-            self.ss_resid /= np.std(self.measurements)
+        try:
+            model.fit(self.times, self.measurements, self.errors)
+            self.ss_resid = np.sqrt(np.mean((model.predict(self.times) - self.measurements) ** 2))
+            if scale:
+                self.ss_resid /= np.std(self.measurements)
+        except ValueError:
+            self.ss_resid = np.inf
 
     def period_fold(self, p=None):
         if p is None:
@@ -129,9 +132,63 @@ def load_linear():
         return light_curves
 
 
+    def load_macho():
+        """Build LightCurve objects from the MACHO header table and per-object files.
+
+        Reads the whitespace-delimited catalogue 'data/macho/machovar.dat' and every
+        ';'-separated file matching '/fastdisks/bnaul/*.txt'. Only the 't', 'mr' and
+        'er' columns are used (presumably time, red magnitude and its error).
+        Rows with mr < -50 or er > 9 are treated as missing; files with no valid
+        'mr' values are skipped. Returns a list of LightCurve with `label` (from
+        Class) and `p` (from rPer) set; raises KeyError for names not in the header.
+        """
+        import datetime
+        header_fname = 'data/macho/machovar.dat'
+        header = pd.read_table(header_fname, header=None, delim_whitespace=True)
+        header.columns = ['Field', 'Tile', 'Seqn', 'RA_DEC', 'rPer', 'bPer', 'Vmag',
+                          'Rmag', 'rAmp', 'bAmp', 'cAmp', 'rSupRSA', 'bSupRSA',
+                          'rchi2', 'bchi2', 'rsig', 'bsig', 'Var', 'Class',
+                          'Points', 'cPoints', 'rPoints', 'bPoints']
+        header.index = ['.'.join(str(el) for el in row)
+                        for row in header.values[:, :3]]
+        LC_types = {1: 'RRL AB', 2: 'RRL C', 3: 'RRL E', 4: 'Ceph Fund',
+                    5: 'Ceph 1st', 6: 'LPV WoodA', 7: 'LPV WoodB',
+                    8: 'LPV WoodC', 9: 'LPV WoodD', 10: 'EB', 11: 'RRL + GB'}
+
+        light_curves = []
+        for i, fname in enumerate(glob.glob('/fastdisks/bnaul/*.txt')):
+            if i % 100 == 0:
+                print(f"{i:5d}/{header.shape[0]}", datetime.datetime.now())
+            df = pd.read_csv(fname, sep=';', header=None)
+            df.columns = ['t', 'mr', 'er', 'mb', 'eb']
+            df.drop_duplicates(subset=['t'], keep='first', inplace=True)
+            # Mask through .loc: assigning into df.values is silently lost
+            # whenever .values returns a copy instead of a view.
+            bad_r = (df['mr'] < -50) | (df['er'] > 9)
+            df.loc[bad_r, ['mr', 'er']] = np.nan
+            bad_b = (df['mb'] < -50) | (df['eb'] > 9)
+            df.loc[bad_b, ['mb', 'eb']] = np.nan
+            if np.isnan(df['mr'].values).all():
+                continue
+            df = df[~np.isnan(df['mr'])]
+            name = '.'.join(os.path.splitext(os.path.basename(fname))[0].split('_')[1:])
+            inds = np.argsort(df['t'].values)
+            lc = LightCurve(name=name, survey='MACHO', times=df['t'].values[inds],
+                            measurements=df['mr'].values[inds],
+                            errors=df['er'].values[inds])
+            lc.label = LC_types[header.Class.loc[lc.name]]
+            lc.p = header.rPer.loc[lc.name]  # catalogue period, not a fitted one
+            lc.fit_supersmoother()
+            light_curves.append(lc)
+        return light_curves
+
+
+
+
 if __name__ == "__main__":
     print("Adding light curve data")
 #    light_curves = LightCurve.load_asas()
 #    joblib.dump(light_curves, 'asas.pkl', compress=3)
-    light_curves = LightCurve.load_linear()
-    joblib.dump(light_curves, 'linear.pkl', compress=3)
+#    light_curves = LightCurve.load_linear()
+#    joblib.dump(light_curves, 'linear.pkl', compress=3)
+    light_curves = LightCurve.load_macho()
+    joblib.dump(light_curves, 'macho.pkl', compress=3)
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,5 @@
 numpy
-pandas
+pandas==0.19.2
 scikit-learn
 tensorflow
 keras==1.2.2

diff --git a/survey_autoencoder.py b/survey_autoencoder.py
@@ -9,12 +9,11 @@
 from light_curve import LightCurve
 
 
-def preprocess(X_raw, m_max=None):
+def preprocess(X_raw, m_max=np.inf):
     X = X_raw.copy()
 
-    if m_max:
-        wrong_units = np.nanmax(X[:, :, 1], axis=1) > m_max
-        X = X[~wrong_units, :, :]
+    wrong_units =  np.all(np.isnan(X[:, :, 1])) | (np.nanmax(X[:, :, 1], axis=1) > m_max)
+    X = X[~wrong_units, :, :]
 
     # Replace times w/ lags
     X[:, :, 0] = ku.times_to_lags(X[:, :, 0])
@@ -60,6 +59,8 @@ def main(args=None):
     X_list = [np.c_[lc.times, lc.measurements, lc.errors] for lc in split]
 
     X_raw = pad_sequences(X_list, value=np.nan, dtype='float', padding='post')
+    if args.N_train is not None:
+        X_raw = X_raw[:args.N_train]
 
     model_type_dict = {'gru': GRU, 'lstm': LSTM, 'vanilla': SimpleRNN}
     X, means, scales, wrong_units = preprocess(X_raw, args.m_max)