Commit
sabofil committed May 3, 2024
1 parent 2a69b3e · commit 5f525f8
Showing 83 changed files with 1,779 additions and 11,111 deletions.
@@ -0,0 +1,3 @@
2024 05 03
- handle missing data in yield without excluding if only one year is missing
- fix info leakage when using trend
@@ -0,0 +1,173 @@
import pandas as pd
import numpy as np
import os
import json


class read:
    def __init__(self, full_path_config, run_name):

        with open(full_path_config, 'r') as fp:
            jdict = json.load(fp)
        self.AOI = jdict['AOI']
        self.year_start = int(jdict['year_start'])
        self.year_end = int(jdict['year_end'])
        self.crops = jdict['crops']
        self.data_dir = jdict['data_dir']
        self.ope_data_dir = jdict['ope_data_dir']
        self.output_dir = jdict['output_dir']

        # run_stamp = datetime.datetime.today().strftime('%Y%m%d')
        self.ope_run_dir = os.path.join(self.output_dir, 'RUN_' + run_name + '_OPE')
        self.ope_run_out_dir = os.path.join(self.ope_run_dir, 'output')
        self.models_dir = os.path.join(self.output_dir, 'RUN_' + run_name + '_TUNING')
        self.models_spec_dir = os.path.join(self.models_dir, 'Specs')
        self.models_out_dir = os.path.join(self.models_dir, 'Output')

        self.ivars = jdict['ivars']
        self.ivars_short = jdict['ivars_short']
        self.ivars_units = jdict['ivars_units']
        self.sos = int(jdict['sos'])
        self.eos = int(jdict['eos'])
        self.yield_units = jdict['yield_units']
        self.area_unit = jdict['area_unit']
        # factor that divides production values to get production in the desired units
        self.production_scaler = jdict['production_scaler']

class mlSettings:
    def __init__(self, forecastingMonths):
        # Define settings used in the ML workflow

        # set forecasting month (1 is the first)
        self.forecastingMonths = forecastingMonths

        # scikit-learn: number of cores to use when multi-threading is possible, at least 4
        self.nJobsForGridSearchCv = 4

        # Input data scaling. Admitted values:
        # z_f: z-score features
        # z_fl: z-score features and labels
        # z_fl_au: z-score features and labels by AU
        self.dataScaling = 'z_f'
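        # Illustrative note (an assumption, not used by this code): with a pandas DataFrame df of feature
        # columns feats, a label column and an 'AU' admin-unit column, the options roughly correspond to
        #   z_f     -> (df[feats] - df[feats].mean()) / df[feats].std()
        #   z_fl    -> the same z-scoring applied to the label column as well
        #   z_fl_au -> df.groupby('AU')[feats + [label]].transform(lambda s: (s - s.mean()) / s.std())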

        # The cost function. Values: 'neg_root_mean_squared_error' and 'r2'
        self.scoringMetric = 'neg_root_mean_squared_error'
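        # Illustrative note (an assumption): these two settings presumably feed a scikit-learn grid search, e.g.
        #   GridSearchCV(estimator, param_grid, scoring=self.scoringMetric, n_jobs=self.nJobsForGridSearchCv)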

        # Models are classified in:
        # 1. benchmarks (Null_model, PeakNDVI, Trend)
        # 2. skModels (scikit-learn models).
        # Options and feature definition do not apply to benchmarks.
        # Benchmark models to be considered
        self.benchmarks = ['Null_model', 'PeakNDVI', 'Trend']

        # feature groups to be considered
        rad_var = 'rad'  # sometimes it is 'Rad'
        self.feature_groups = {
            'rs_met': ['ND', 'NDmax', rad_var, 'RainSum', 'T', 'Tmin', 'Tmax'],
            'rs_met_reduced': ['ND', 'RainSum', 'T'],
            'rs_met_sm_reduced': ['ND', 'RainSum', 'T', 'SM'],  # test for ZA
            'rs': ['ND', 'NDmax'],
            'rs_reduced': ['ND'],
            'rs_sm_reduced': ['ND', 'SM'],
            'met': [rad_var, 'RainSum', 'T', 'Tmin', 'Tmax'],
            'met_reduced': [rad_var, 'RainSum', 'T'],
            'met_sm_reduced': [rad_var, 'RainSum', 'T', 'SM']
        }

        # dictionary for group labels used in plots
        self.feature_groups2labels = {
            'rs_met': 'RS&Met',
            'rs_met_reduced': 'RS&Met-',
            'rs_met_sm_reduced': 'SM&RS&Met-',
            'rs': 'RS',
            'rs_reduced': 'RS-',
            'rs_sm_reduced': 'SM&RS-',
            'met': 'Met',
            'met_reduced': 'Met-',
            'met_sm_reduced': 'SM&Met-'
        }

        # model configuration settings to be tested
        self.time_samplings = ['M']  # ['P', 'M']

        # y variables to be predicted
        # self.yvars = ['Yield']  # ['Yield', 'Production']

        # Admin unit ID OHE types to be tested
        self.doOHEs = ['none', 'AU_level']  # ['none', 'AU_level', 'Cluster_level']

        # trend
        self.addYieldTrend = [True, False]
        # number of years of y data before the first year with features AND number of years used for trend computation
        self.ny_max_trend = 12

        # Add the average y to the feature set.
        # This is an alternative way to pass the admin-level unobserved effect to the model: the y average by AU is
        # used as an additional feature. {old: in the outer CV leave-one-year-out loop, it uses the mean of the
        # training y} It is always scaled because features are scaled. It is an alternative to DoOHEnc
        # (to be used only with DoOHEnc set to False).
        # Admitted values: False, True
        self.AddTargetMeanToFeature = False
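        # Illustrative sketch (an assumption, not part of this workflow): with a training DataFrame df holding
        # 'AU' and 'Yield' columns, the per-AU mean feature could be built as
        #   df['Yield_AU_mean'] = df.groupby('AU')['Yield'].transform('mean')
        # computed on the training folds only, so that validation labels do not leak in.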

        # Feature selection
        self.feature_selections = ['none', 'MRMR']
        # percentage of features to be selected (grid to be tested)
        self.feature_prct_grid = [5, 25, 50, 75, 100]

        # Data reduction with PCA
        self.dataReduction = ['none', 'PCA']
        self.PCAprctVar2keep = 90

        # Hyperparameter grid space
        self.hyperparopt = 'grid'
        if self.hyperparopt == 'grid':
            # Hidden layers treated as hyperparameters for the NN MLP (1, 2 and 3 layers)
            # Set values that are powers of two or values that can be divided by two
            hl2 = [(i, j) for i in [16, 32, 48, 64] for j in [16, 32, 48, 64]]
            hl3 = [[16, 32, 16], [16, 48, 16], [32, 48, 32], [32, 64, 32], [48, 64, 48],
                   [32, 32, 32], [48, 48, 48], [64, 64, 64], [16, 16, 16]]
            hl = hl2 + hl3
            self.hyperGrid = dict(
                Lasso={'alpha': np.logspace(-5, 0, 13, endpoint=True).tolist()},
                RandomForest={'max_depth': [10, 15, 20, 25, 30, 35, 40],  # maximum number of levels in a tree
                              'max_features': ['auto', 'sqrt'],  # number of features to consider at every split
                              'n_estimators': [100, 250, 500],  # number of trees in the random forest
                              'min_samples_split': np.linspace(0.2, 0.8, 6, endpoint=True).tolist()},
                MLP={'alpha': np.logspace(-5, -1, 6, endpoint=True),
                     'hidden_layer_sizes': hl,
                     'activation': ['relu', 'tanh'],
                     'learning_rate': ['constant', 'adaptive']},
                # SVR_linear={'gamma': np.logspace(-2, 2, 2, endpoint=True).tolist(),
                #             'epsilon': np.logspace(-6, .5, 2, endpoint=True).tolist(),
                #             'C': [1, 100]},
                SVR_linear={'gamma': np.logspace(-2, 2, 7, endpoint=True).tolist(),
                            # gamma defines how much influence a single training example has;
                            # the larger gamma is, the closer other examples must be to be affected
                            'epsilon': np.logspace(-6, .5, 7, endpoint=True).tolist(),
                            'C': [1e-5, 1e-4, 1e-3, 1e-2, 1, 10, 100]},
                # SVR_rbf={'gamma': np.logspace(-2, 2, 2, endpoint=True).tolist(),
                #          'epsilon': np.logspace(-6, .5, 2, endpoint=True).tolist(),
                #          'C': [1e-5, 100]},
                SVR_rbf={'gamma': np.logspace(-2, 2, 7, endpoint=True).tolist(),
                         'epsilon': np.logspace(-6, .5, 7, endpoint=True).tolist(),
                         'C': [1e-5, 1e-4, 1e-3, 1e-2, 1, 10, 100]},
                # GPR1={'alpha': [1e-10, 1e-5, 1e-1]},
                # GPR2={'alpha': [1e-10, 1e-5, 1e-1, 0.05]},
                GPR={'alpha': [1e-10, 1e-5, 1e-1, 0.05]},
                GBR={'learning_rate': [0.01, 0.05, 0.1],
                     # empirical evidence suggests that small values of learning_rate favour better test error;
                     # [HTF] recommend setting the learning rate to a small constant (e.g. learning_rate <= 0.1)
                     # and choosing n_estimators by early stopping
                     'max_depth': [10, 20, 40],
                     'n_estimators': [100, 250, 500],
                     'min_samples_split': np.linspace(0.1, 0.8, 6, endpoint=True).tolist()},
                XGBoost={'learning_rate': [0.05, 0.1, 0.3, 0.5],
                         'max_depth': [2, 4, 6, 8],
                         'min_child_weight': [1, 3, 5]}
                         # 'gamma': [0, 2, 5]
            )
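
As an illustrative aside (not part of the commit): the read class above pulls a fixed set of keys from the JSON config it is given. A minimal sketch of such a file, written here from Python, might look as follows; every value, crop name and path below is a placeholder, not the project's actual settings.

import json

# Hypothetical minimal config with the keys a10_config.read expects; all values are placeholders.
example_config = {
    "AOI": "DZ",
    "year_start": "2002",
    "year_end": "2023",
    "crops": ["Durum wheat", "Barley"],
    "data_dir": "/data/inputs",
    "ope_data_dir": "/data/inputs_ope",
    "output_dir": "/data/outputs",
    "ivars": ["ND", "NDmax", "rad", "RainSum", "T", "Tmin", "Tmax"],
    "ivars_short": ["ND", "NDx", "Rad", "Rain", "T", "Tn", "Tx"],
    "ivars_units": ["-", "-", "W m-2", "mm", "C", "C", "C"],
    "sos": "11",
    "eos": "6",
    "yield_units": "t/ha",
    "area_unit": "ha",
    "production_scaler": 1000
}

with open("example_config.json", "w") as fp:
    json.dump(example_config, fp, indent=2)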
@@ -0,0 +1,76 @@
import time
import os
import glob
import json

from A_config import a10_config
from C_model_setting import c100_save_model_specs
from B_preprocess import b100_load
from D_modelling import d100_modeller

if __name__ == '__main__':
    '''
    This script shall be used to run these different runTypes:
    1) [tuning] tunes models with a double LOYO loop (tests various configurations)
    2) [opeTune] tunes the best model on all available years
    3) [opeForecast] runs the operational yield forecast of 2) using predictors only. Predictors are saved in a
       different dir that can be updated, to avoid overwriting the features used for training.
    '''
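    # Usage note (an assumption, not stated in the commit): this file appears to be run directly,
    # e.g. python Algeria_manager.py, after editing the config path, runType and the
    # "MODIFY HERE" block below for the desired experiment.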

    start_time = time.time()
    # load region-specific data info
    config = a10_config.read(r'V:\foodsec\Projects\SNYF\Algeria\MLYF\Algeria_config.json')
    # load model configurations to be tested
    modelSettings = a10_config.mlSettings(forecastingMonths=[7])
    runType = 'tuning'  # ['tuning', 'opeForecast']

    if runType == 'tuning':
        ###############################################################################################################
        # MODIFY HERE TO DO LESS TESTING
        want_keys = ['rs_met_reduced']
        modelSettings.feature_groups = dict(filter(lambda x: x[0] in want_keys, modelSettings.feature_groups.items()))
        modelSettings.feature_prct_grid = [50, 75, 100]
        want_keys = ['Lasso', 'SVR_linear']
        modelSettings.hyperGrid = dict(filter(lambda x: x[0] in want_keys, modelSettings.hyperGrid.items()))
        modelSettings.addYieldTrend = [False]
        modelSettings.dataReduction = ['none']
        ###############################################################################################################

        if False:  # already tested
            b100_load.LoadPredictors_Save_Csv(config, runType)
            b100_load.build_features(config, runType)
            # remove admin units with missing data in yield !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
            b100_load.LoadLabel_Exclude_Missing(config)
            # prepare json files specifying the details of each run to be tested
            c100_save_model_specs.save_model_specs(config, modelSettings)

        # print(config.__dict__)
        # print(config.sos)

        # get the produced spec file list
        spec_files_list = glob.glob(os.path.join(config.models_spec_dir, '*.json'))
        for fn in spec_files_list:
            with open(fn, 'r') as fp:
                uset = json.load(fp)
            # print(uset)
            hindcaster = d100_modeller.YieldModeller(uset)
            # preprocess
            X, y, groups, feature_names, AU_codes = hindcaster.preprocess(config, runType)
            tic = time.time()
            # fit and put results in a dict
            hyperParamsGrid, hyperParams, Fit_R2, coefFit, mRes, prctPegged, \
                selected_features_names, prct_selected, n_selected, \
                avg_scoring_metric_on_val = hindcaster.fit(X, y, groups, feature_names, AU_codes)
            runTimeH = (time.time() - tic) / (60 * 60)
            print(f'Model fitted in {runTimeH} hours')
            hindcaster.validate(hyperParamsGrid, hyperParams, Fit_R2, coefFit, mRes, prctPegged, runTimeH,
                                feature_names, selected_features_names, prct_selected, n_selected,
                                avg_scoring_metric_on_val, config, save_file=True, save_figs=True)
        print('ended Algeria_manager')

    print("--- %s seconds ---" % (time.time() - start_time))