Commit

Merge pull request #85 from merekat/michel
cleaning
merekat authored Jul 12, 2024
2 parents 960ed85 + 893331c commit 5f64da8
Showing 4 changed files with 50 additions and 235 deletions.
12 changes: 11 additions & 1 deletion README.md
@@ -2,7 +2,17 @@

## Flight Prediction Test on Airport Data from Tunisian Airline

Based on several machine learning classifiers, this project tries to predict delays of individual flights.

### Set up the Presentation

- The presentation can be started with Streamlit. Make sure Streamlit is installed in your environment, as described in the requirements.

```bash
streamlit run app.py
```
After that, a local server is started and the app opens in your default browser.



## Set up your Environment
39 changes: 37 additions & 2 deletions example_files/train.py
@@ -194,7 +194,6 @@
duplicate_columns = df.columns[df.columns.duplicated()]
df = df.loc[:, ~df.columns.duplicated()]

# Target engineering
# Convert target into certain category intervals

def target_interval(row):
@@ -212,7 +211,7 @@ def target_interval(row):
return 6

df['target_cat'] = df.apply(target_interval, axis=1)
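The body of `target_interval` is collapsed in this diff, so the concrete cut-offs are not visible. A sketch of the binning idea, with hypothetical thresholds (the real ones in `train.py` may differ), looks like:

```python
# Hypothetical sketch of the interval-binning step; the actual
# thresholds and column name are assumptions, not the project's code.
def target_interval(row):
    delay = row["target"]  # assumed name of the continuous delay column
    if delay <= 0:
        return 0
    elif delay <= 10:
        return 1
    elif delay <= 30:
        return 2
    elif delay <= 60:
        return 3
    elif delay <= 120:
        return 4
    elif delay <= 240:
        return 5
    return 6
```

Applied row-wise with `df.apply(target_interval, axis=1)`, this turns the continuous delay into a 7-class categorical target.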

# Standardization

# Create a StandardScaler object
@@ -231,7 +230,43 @@ def target_interval(row):
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=RSEED)
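The scaling code between these comments is collapsed in the diff. Under the assumption that `X`, `y`, and `RSEED` are defined as in the script (the placeholder data below is illustrative), the standardization and stratified split can be sketched as:

```python
# Hypothetical sketch of the standardization + split steps; the data
# and RSEED value are placeholders, not the project's actual inputs.
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

RSEED = 42                          # assumed seed value
np.random.seed(0)
X = np.random.rand(100, 5)          # placeholder features
y = np.random.randint(0, 7, 100)    # placeholder 7-class target

# Create a StandardScaler object and scale features to zero mean, unit variance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, stratify=y, test_size=0.2, random_state=RSEED
)
```

Stratifying on the 7-class target keeps the class proportions comparable between the train and test folds.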

# Train model
# Define the parameter distribution for random search
param_dist = {
    'n_estimators': randint(50, 100),                    # Reduced upper bound
    'learning_rate': uniform(0.01, 0.5),                 # Reduced upper bound
    'base_estimator__max_depth': randint(1, 5),          # Reduced upper bound
    'base_estimator__min_samples_split': randint(2, 10), # Reduced upper bound
    'base_estimator__min_samples_leaf': randint(1, 10),  # Reduced upper bound
    'algorithm': ['SAMME', 'SAMME.R']
}

# Create a base model
base_estimator = DecisionTreeClassifier(random_state=RSEED)
ada = AdaBoostClassifier(base_estimator=base_estimator, random_state=RSEED)

# Create a custom scorer; an explicit average is required for the
# multi-class target (you can change this to other metrics if needed)
scorer = make_scorer(f1_score, average='weighted')

# Instantiate RandomizedSearchCV object
random_search = RandomizedSearchCV(
    estimator=ada,
    param_distributions=param_dist,
    n_iter=50,           # Reduced number of iterations
    cv=3,                # Reduced number of cross-validation folds
    scoring=scorer,
    random_state=RSEED,
    n_jobs=-1            # use all available cores
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", random_search.best_params_)
print("Best cross-validation score:", random_search.best_score_)

# Get the best model
model = random_search.best_estimator_

# Save the model
dump(model, 'models/model.joblib')
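The `dump()` call above persists the best estimator with joblib. Loading it back for inference is symmetric; the sketch below stands in a small classifier for the trained AdaBoost model (the path and data are illustrative):

```python
# Sketch of reloading a persisted model for inference; the classifier
# and path here are stand-ins, not the project's trained model.
from joblib import dump, load
from sklearn.tree import DecisionTreeClassifier

# Stand-in for the fitted model produced by the random search above.
clf = DecisionTreeClassifier(random_state=42).fit([[0], [1]], [0, 1])
dump(clf, "model.joblib")

restored = load("model.joblib")
prediction = restored.predict([[1]])
```

In the project itself, the Streamlit app would load `models/model.joblib` the same way at startup.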
231 changes: 0 additions & 231 deletions project_classification.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion requirements.txt
@@ -9,4 +9,5 @@ jupyterlab-dash==0.1.0a3
scikit-learn==1.2.2
statsmodels==0.13.5
pytest==7.3.1
xgboost==1.24.3
streamlit==1.36.0
