# PredictionModel/PredictionAndNLP_models.py

# -*- coding: utf-8 -*-
"""CryptoPredictModel.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1LHuZK0S-AWaGC7ijgOQFwlF3FAKSUg1W
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, Dropout, Dense, Activation
from tensorflow.compat.v1.keras.layers import Bidirectional, CuDNNLSTM, Dropout, Dense, Activation
from matplotlib.dates import DateFormatter, AutoDateLocator
import lightgbm as lgb

def directionalAccuracy(Y_train, Y_pred):
    """Return the percentage of samples whose two values agree in sign.

    A pair counts as a hit when the product of the two values is >= 0, so a
    zero on either side is always treated as agreement. Values are compared
    by position.

    Args:
      Y_train (array-like of float) - reference (actual) values
      Y_pred (array-like of float) - predicted values, same length

    Returns:
      float - share of hits scaled to the range 0-100
    """
    # Coerce to ndarrays so plain lists/tuples multiply element-wise instead
    # of raising TypeError
    actual = np.asarray(Y_train, dtype=float)
    predicted = np.asarray(Y_pred, dtype=float)
    return np.mean(actual * predicted >= 0) * 100

def estimateModelPerformance(model, X, Y):
    """Print regression metrics for a fitted model and record two of them.

    Predictions are made with the model's best iteration. Mean squared error,
    directional accuracy, R^2 and the Pearson correlation are printed; R^2 and
    MSE are also written to row 0 of the module-level df_analysis DataFrame,
    which must exist before this function is called.

    Args:
      model - fitted estimator exposing predict(X, num_iteration=...) and a
        best_iteration_ attribute (presumably a LightGBM sklearn-API model)
      X - feature matrix to predict on
      Y - true target values, same length as X

    Returns:
      None
    """
    # Imported locally: the module-level imports do not provide these names
    from sklearn.metrics import mean_squared_error, r2_score

    global df_analysis

    print("\nPerformance of the Model:\n")
    Y_pred = model.predict(X, num_iteration=model.best_iteration_)

    directional_accuracy = directionalAccuracy(Y, Y_pred)
    r2 = r2_score(Y, Y_pred)
    mse = mean_squared_error(Y, Y_pred)
    # Off-diagonal entry of the 2x2 correlation matrix
    correlation = np.corrcoef(Y_pred, Y)[0, 1]

    print(f"Mean Squared Error: {mse}")
    print(f"Directional Accuracy: {directional_accuracy}")
    print(f"R^2: {r2}")
    print(f"Correlation: {correlation}")

    df_analysis.loc[0, 'R^2_train'] = r2
    df_analysis.loc[0, 'MSE_Train'] = mse

def optimizeModel(X_train, Y_train, X_val, Y_val, predictionYear, X_train_weight):
    """Run a randomized hyper-parameter search for an LGBM regressor.

    Fifteen random parameter draws are evaluated with a 5-split
    TimeSeriesSplit over the training data. The search outcome is appended to
    the module-level scorePerYear list as [best_score_, cv_results_]; that
    list must exist before this function is called.

    Args:
      X_train, Y_train - training features / targets handed to the search
        (presumably X_train carries a 'SECTOR' column, since it is declared
        as the categorical feature - confirm against the caller)
      X_val, Y_val - validation pair forwarded to LightGBM as eval_set
      predictionYear - not used inside this function
      X_train_weight - per-sample weights forwarded as sample_weight

    Returns:
      tuple (opt_params, fit_params) - the best parameter dict found and the
      keyword arguments that were passed to fit()
    """
    # Imported locally: the module-level imports do not provide these names
    from scipy.stats import randint as sp_randint
    from scipy.stats import uniform as sp_uniform
    from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit

    global scorePerYear

    # Search space sampled by RandomizedSearchCV
    param_test = {
        'num_leaves': sp_randint(6, 4000),
        'min_child_samples': sp_randint(10, 400),
        'min_child_weight': [1e-1, 5e-1, 5e-2, 1, 1e1, 1e2],
        'subsample': sp_uniform(loc=0.3, scale=0.7),
        'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
        'reg_alpha': [1e-6, 1e-5, 1e-4, 1e-3, 5e-3, 1e-2, 5e-2, 2e-1],
        'reg_lambda': [1e-6, 1e-5, 1e-4, 1e-3, 5e-3, 1e-2, 5e-2, 2e-1],
        'learning_rate': [1e-6, 1e-5, 1e-4, 1e-3, 5e-3, 1e-2, 5e-2, 2e-1],
        'max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
    }

    # Keyword arguments forwarded to LGBMRegressor.fit();
    # early_stopping_rounds (previously 30) is left disabled.
    # NOTE(review): acceptance of 'verbose' here and of `silent` below depends
    # on the installed LightGBM release - confirm.
    fit_params = {
        'eval_set': [(X_val, Y_val)],
        'eval_names': ['valid'],
        'verbose': 5000,
        'categorical_feature': ['SECTOR'],
        'sample_weight': X_train_weight,
    }

    # Ordered folds: each test fold is one sixth of the training rows, with a
    # one-sample gap between train and test
    tscv = TimeSeriesSplit(n_splits=5, test_size=int(len(X_train)/6), gap=1)

    # max_depth=-1 is only the starting value; the search overrides it
    clf = lgb.LGBMRegressor(max_depth=-1, random_state=314, silent=True, metric='mse',
                            n_jobs=-1, n_estimators=1500, boosting_type='dart')

    gs = RandomizedSearchCV(estimator=clf, param_distributions=param_test, n_iter=15, cv=tscv,
                            random_state=314, verbose=True, refit=True, return_train_score=True)

    gs.fit(X_train, Y_train, **fit_params)

    scorePerYear.append([gs.best_score_, gs.cv_results_])

    opt_params = gs.best_params_

    return opt_params, fit_params

"""
The nav() function below calculates several metrics. It updates the df_analysis, df_combine_top
and df_combine_bottom dataframes and stores the results in 'nav3.xlsx'.

Metrics calculated:
1. Within +-50%
2. Rank Error

3.1 Common top
3.2 Common bottom

4.1 NAV Top Actual
4.2 Nav Top Predicted
4.3 NAV Bottom Actual
4.4 Nav Bottom Predicted
"""

def nav(df_pred, ascending, sheetname, year):
  """
  Scores the January predictions and simulates two 30-stock portfolios for one year.

  Only the rows of df_pred whose 'DATE' falls in January are used. One
  portfolio holds the first 30 stocks ranked by predicted value, the other
  the first 30 ranked by actual value; both start at a total of 100 and are
  compounded with the period-over-period price changes found in the
  module-level allPrices DataFrame for the given year.

  Side effects: updates the module-level df_analysis, df_combine_top /
  df_combine_bottom and count_top / count_bottom, prints the metrics, and
  writes sheets into the workbook 'nav3.xlsx' inside the module-level
  `directory`. The names allPrices, directory, os and pxl must be defined at
  module level (pxl is presumably openpyxl - confirm).

  Args:
    df_pred (DataFrame) - needs 'DATE', 'NEXT_PX_PRED', 'NEXT_PX' and
      'STOCK_NAME' columns
    ascending (bool) - sort direction for both rankings; False records the
      "top" metrics plus 'Within_+-50%' and 'rank_error', True records the
      "bottom" metrics
    sheetname (string) - sheet that receives the predicted-ranking portfolio
    year (int) - calendar year of allPrices used for the simulation

  Returns:
    None
  """

  global count_top
  global count_bottom
  global df_combine_top
  global df_combine_bottom
  global df_analysis

  predicted = 0
  count_50 = 0
  count_rank = 0

  # Work on a copy and keep only the rows dated in January
  df = df_pred.copy()
  df = df[df['DATE'].dt.month == 1]

  # Calculating 'Within +-50%' metric
  val1 = df['NEXT_PX_PRED'].tolist()
  val2 = df['NEXT_PX'].tolist()

  # Count predictions whose absolute error is below half the magnitude of the actual value
  for i in range(len(val1)):
    val = abs(val1[i] - val2[i])
    if val < (abs(val2[i])/2):
      count_50 += 1

  # The ratio is only recorded and printed on the descending ("top") call
  if ascending == False:
    result = count_50/len(val1)
    df_analysis.loc[0, 'Within_+-50%'] = result

    print(f'\nWithin +-50% of actual return: {result}')

  # Sorting based on actual 'px' and 'predicted px'
  df_px_pred = df.copy()
  df_px = df.copy()

  df_px_pred = df_px_pred.sort_values(by='NEXT_PX_PRED', ascending=ascending)
  df_px = df_px.sort_values(by='NEXT_PX', ascending=ascending)

  stocks_px_pred = df_px_pred['STOCK_NAME'].tolist()
  stocks_px = df_px['STOCK_NAME'].tolist()

  # Calculating 'rank error'
  # Sum of |predicted rank - actual rank| per stock; list.index() returns the
  # first match, so this presumably expects unique stock names - TODO confirm

  for i in range(len(stocks_px_pred)):
    count_rank += abs(i - stocks_px.index(stocks_px_pred[i]))

  # The average rank difference is only recorded on the descending ("top") call
  if ascending == False:
    result = count_rank/len(stocks_px_pred)
    df_analysis.loc[0, 'rank_error'] = result

    print(f'Avg_rank_difference: {result}')

  # Keeping first 30 stocks from both dataframes
  df_px_pred = df_px_pred.iloc[:30,:]
  df_px = df_px.iloc[:30,:]

  stocks_px_pred = df_px_pred['STOCK_NAME'].tolist()
  stocks_px = df_px['STOCK_NAME'].tolist()

  # Using allPrices DataFrame (SPX_Prices.xlsx)
  # allPrices is a module-level name; it is indexed below by 'DATE' and by stock name

  dates = allPrices['DATE'].tolist()

  allPrices_px_pred = pd.DataFrame({'DATE' : dates})
  allPrices_px = pd.DataFrame({'DATE' : dates})

  # Just making 2 new allPrices dataframes with ordering as per 'df_px_pred' and 'df_px' with 30 stocks only

  for i in range(len(stocks_px_pred)):
    allPrices_px_pred[stocks_px_pred[i]] = allPrices[stocks_px_pred[i]].tolist()

  for i in range(len(stocks_px)):
    allPrices_px[stocks_px[i]] = allPrices[stocks_px[i]].tolist()

  # Getting unique stock names with ordering as per 'df_px_pred' and 'df_px'. Converting to sets to find intersection.

  stocks_px_pred = set(stocks_px_pred)
  stocks_px = set(stocks_px)

  # Number of stocks present in both 30-stock selections
  if ascending == False:
    common_top = len(stocks_px_pred.intersection(stocks_px))
    df_analysis.loc[0, 'Common_top'] = common_top

    print(f'Common_Stocks_Top : {common_top}')

  if ascending == True:
    common_bottom = len(stocks_px_pred.intersection(stocks_px))
    df_analysis.loc[0, 'Common_bottom'] = common_bottom

    print(f'Common_Stocks_Bottom : {common_bottom}')

  # Taking price data only for 1 year for the 30 stocks
  yearPrice_px_pred = allPrices_px_pred[allPrices_px_pred['DATE'].dt.year == year]

  # Creating another dataframe to calculate some metric for df_combine_bottom and df_combine_top
  df_combine_data = pd.DataFrame({'DATE' : dates})
  df_combine_data = df_combine_data[df_combine_data['DATE'].dt.year == year]

  # df_val1 will hold period-over-period returns, df_val2 the compounded position values
  df_val1 = yearPrice_px_pred.copy()
  df_val2 = yearPrice_px_pred.copy()

  # Equal starting weights: a total of 100 spread over 30 positions.
  # NOTE(review): the length is hard-coded to 30; looks like this expects
  # exactly 30 stock columns - confirm behaviour when fewer stocks are available
  initial = np.array([100/30 for i in range(30)])
  df_val2.iloc[0,1:] = initial

  # Column 0 is 'DATE', hence the 1: slices; each row compounds the previous row's value
  for i in range(1,len(yearPrice_px_pred)):
    df_val1.iloc[i,1:] = (yearPrice_px_pred.iloc[i,1:] - yearPrice_px_pred.iloc[i-1,1:])/(yearPrice_px_pred.iloc[i-1,1:])
    df_val2.iloc[i,1:] = df_val2.iloc[i-1,1:] + df_val2.iloc[i-1,1:]*df_val1.iloc[i,1:]

  # Portfolio value per row; the last row's total is the "predicted" result
  df_val2['sum'] = df_val2.iloc[:,1:].sum(axis = 1)
  predicted = df_val2.iloc[-1,-1]

  # Chain this year onto the running series: the row totals (excluding 'DATE'
  # and 'sum') are rescaled by the running counter, which is then replaced by
  # the last value of the combined frame's last column
  if ascending == False:
    df_combine_data['sum'] = df_val2.iloc[:,1:-1].sum(axis = 1)*(count_top/100)
    df_combine_top = pd.concat([df_combine_top, df_combine_data])
    count_top = df_combine_top.iloc[-1,-1]

  if ascending == True:
    df_combine_data['sum'] = df_val2.iloc[:,1:-1].sum(axis = 1)*(count_bottom/100)
    df_combine_bottom = pd.concat([df_combine_bottom, df_combine_data])
    count_bottom = df_combine_bottom.iloc[-1,-1]

  # Load the workbook first so that its current sheets can be handed to the writer
  # (presumably 'nav3.xlsx' must already exist in `directory` - confirm)
  excel_book = pxl.load_workbook(os.path.join(directory, "nav3.xlsx"))

  with pd.ExcelWriter(os.path.join(directory, "nav3.xlsx"), engine='openpyxl') as writer:

    # NOTE(review): assigning writer.book / writer.sheets and calling writer.save()
    # depend on the installed pandas version - confirm they are still supported
    writer.book = excel_book
    writer.sheets = {worksheet.title: worksheet for worksheet in excel_book.worksheets}

    # Both combined frames are written on every call, whatever the direction
    df_val2.to_excel(writer, sheet_name = sheetname, index=False)
    df_combine_top.to_excel(writer, sheet_name = 'combine_top', index=False)
    df_combine_bottom.to_excel(writer, sheet_name = 'combine_bottom', index=False)

    writer.save()

  # Same simulation again, this time for the stocks selected by actual ranking
  yearPrice_px = allPrices_px[allPrices_px['DATE'].dt.year == year]
  # df_combine_data was already restricted to `year` above; it is not used after this line
  df_combine_data = df_combine_data[df_combine_data['DATE'].dt.year == year]

  df_val1 = yearPrice_px.copy()
  df_val2 = yearPrice_px.copy()

  # The price cancels out, so every position starts at 100/30 wherever the
  # price in the second row is non-zero and not NaN
  df_val2.iloc[0,1:] = (yearPrice_px.iloc[1,1:]*100)/(yearPrice_px.iloc[1,1:]*30)

  for i in range(1,len(yearPrice_px)):
    df_val1.iloc[i,1:] = (yearPrice_px.iloc[i,1:] - yearPrice_px.iloc[i-1,1:])/(yearPrice_px.iloc[i-1,1:])
    df_val2.iloc[i,1:] = df_val2.iloc[i-1,1:] + df_val2.iloc[i-1,1:]*df_val1.iloc[i,1:]

  df_val2['sum'] = df_val2.iloc[:,1:].sum(axis = 1)

  # Final portfolio value of the actual-ranking selection
  actual = df_val2.iloc[-1,-1]

  # Report both end values and store them under the direction-specific column names
  if ascending == False:
    print(f'actual_return_top: {actual}')
    print(f'predicted_return_top: {predicted}')

    df_analysis.loc[0, 'NAV_Top_Actual'] = actual
    df_analysis.loc[0, 'NAV_Top_Predicted'] = predicted

  if ascending == True:

    print(f'actual_return_bottom: {actual}')
    print(f'predicted_return_bottom: {predicted}')

    df_analysis.loc[0, 'NAV_Bottom_Actual'] = actual
    df_analysis.loc[0, 'NAV_Bottom_Predicted'] = predicted

def plot_series(x, y, format="-", start=0, end=None,
                title=None, xlabel=None, ylabel=None, legend=None):
    """
    Draws one or several series against a shared x-axis and shows the figure

    Args:
      x (sequence) - values for the x-axis
      y (sequence or tuple of sequences) - a single series, or a tuple holding
        several series that are drawn on the same axes
      format (string) - matplotlib line style
      start (int) - index of the first point to draw
      end (int) - index one past the last point to draw (None = to the end)
      title (string) - title of the plot
      xlabel (string) - label for the x-axis
      ylabel (string) - label for the y-axis
      legend (list of strings) - legend entries; skipped when empty or None
    """

    plt.figure(figsize=(10, 6))

    # Only an exact tuple is treated as "several series"; anything else is a
    # single series and gets wrapped so one loop serves both cases
    all_series = y if type(y) is tuple else (y,)
    shown = slice(start, end)

    for current in all_series:
        plt.plot(x[shown], current[shown], format)

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    if legend:
        plt.legend(legend)

    plt.title(title)
    plt.grid(True)
    plt.show()

def get_data(file_name):
    """Load a price-history CSV, reverse its row order and make 'Price' numeric.

    The rows are reversed and re-indexed from 0 (presumably the export lists
    the newest row first - confirm against the data file). Thousands
    separators (',') are stripped from 'Price' before it is converted to
    float; a 'Price' column that pandas already parsed as numeric is
    converted directly instead of failing on the string accessor.

    Args:
      file_name (string or file-like) - anything accepted by pandas.read_csv

    Returns:
      DataFrame - the reversed frame with a float 'Price' column
    """
    df = pd.read_csv(file_name)
    df = df[::-1].reset_index(drop=True)

    price = df['Price']
    if not pd.api.types.is_numeric_dtype(price):
        # Text column such as "1,234.5": drop the separators first
        price = price.astype(str).str.replace(',', '', regex=False)
    df['Price'] = price.astype(float)
    return df

def split_data_set(df, ratio):
    """Cut the 'Date' and 'Price' columns into a leading and a trailing part.

    The cut position is int(len(df) * ratio); it is printed before the split.

    Args:
      df (DataFrame) - frame with 'Date' and 'Price' columns
      ratio (float) - fraction of rows that goes into the leading part

    Returns:
      tuple - (time_train, x_train, time_valid, x_valid) as Series
    """
    cut = int(len(df) * ratio)
    print(cut)

    dates = df['Date']
    prices = df['Price']
    leading, trailing = slice(None, cut), slice(cut, None)

    return dates[leading], prices[leading], dates[trailing], prices[trailing]

def windowed_dataset(series, window_size, batch_size, shuffle_buffer):
    """Turns a series into shuffled, batched (features, label) windows

    Every window spans window_size + 1 consecutive values: the first
    window_size values are the features, the final value is the label.

    Args:
      series (array of float) - values of the time series
      window_size (int) - number of time steps used as features
      batch_size (int) - number of windows per batch
      shuffle_buffer (int) - buffer size handed to Dataset.shuffle

    Returns:
      dataset (TF Dataset) - batches of (features, label) pairs
    """
    span = window_size + 1

    def as_tensor(window):
        # A window is itself a small dataset; batching it whole yields one tensor
        return window.batch(span)

    def split_features_label(window):
        return window[:-1], window[-1]

    return (
        tf.data.Dataset.from_tensor_slices(series)
        # incomplete windows at the end of the series are dropped
        .window(span, shift=1, drop_remainder=True)
        .flat_map(as_tensor)
        .map(split_features_label)
        .shuffle(shuffle_buffer)
        .batch(batch_size)
        .prefetch(1)
    )

def add_features(df, window=10):
    """Add a moving-average column and two lag columns to df, in place.

    New columns: 'Price_MA' (rolling mean of 'Price' over `window` rows),
    'Price_1Lag' and 'Price_2Lag' ('Price' shifted by one and two rows).
    Afterwards every row that contains a NaN in any column is dropped, which
    removes at least the warm-up rows of the rolling mean.

    Args:
      df (DataFrame) - frame with a 'Price' column; it is modified in place
      window (int) - length of the moving-average window

    Returns:
      DataFrame - the same df object that was passed in
    """
    prices = df['Price']
    # Use the function's own window length rather than an unrelated
    # module-level setting
    df['Price_MA'] = prices.rolling(window=window).mean()
    df['Price_1Lag'] = prices.shift(1)
    df['Price_2Lag'] = prices.shift(2)
    df.dropna(inplace=True)
    return df

# Parameters
# Module-level settings for the windowed Keras pipeline; window_size and
# batch_size are read as globals by model_forecast_runner().

window_size = 3  # number of time steps per input window
batch_size = 32  # number of windows per batch
shuffle_buffer_size = 1000  # buffer size intended for Dataset.shuffle

def get_train_set(x_train):
    """Build the windowed training dataset that is fed to the Keras model.

    Uses the module-level window_size, batch_size and shuffle_buffer_size.
    The series itself is windowed here: the result is passed to model.fit()
    by tune_model() and run_model(), and add_features() cannot be applied
    because it needs a DataFrame with a 'Price' column.

    Args:
      x_train (array of float) - training part of the price series

    Returns:
      TF Dataset - shuffled batches of (window, next value) pairs
    """
    return windowed_dataset(x_train, window_size, batch_size, shuffle_buffer_size)

# Build the Model

def build_model(window_size):
    """Assemble the Conv1D + stacked-LSTM regression network and print its summary

    Args:
      window_size (int) - number of time steps per input window; the network
        is declared with input shape [window_size, 1]

    Returns:
      model (TF Keras Model) - the uncompiled Sequential model
    """
    layers = tf.keras.layers

    stack = [
        # Causal padding pads only on the left of the sequence
        layers.Conv1D(filters=64, kernel_size=3,
                      strides=1,
                      activation="relu",
                      padding='causal',
                      input_shape=[window_size, 1]),
        layers.LSTM(64, return_sequences=True),
        layers.LSTM(64),
        layers.Dense(30, activation="relu"),
        layers.Dense(10, activation="relu"),
        layers.Dense(1),
        # Fixed rescaling of the single output by a factor of 400
        layers.Lambda(lambda out: out * 400),
    ]
    model = tf.keras.models.Sequential(stack)

    model.summary()
    return model

def build_model_lgbm():
    """Create an unfitted LightGBM regressor with fixed hyper-parameters

    Returns:
      lgb.LGBMRegressor - 500-estimator gbdt regressor evaluated with RMSE
    """
    return lgb.LGBMRegressor(
        n_estimators=500,
        objective='regression',
        metric='rmse',  # Root Mean Squared Error
        boosting_type='gbdt',
        num_leaves=31,
        learning_rate=0.05,
        feature_fraction=0.9,
    )

def tune_model(model, train_set):
    """Run a learning-rate sweep, plot it, then reset and recompile the model

    The model is trained for 100 epochs while the learning rate grows from
    1e-8 by a factor of 10 every 20 epochs, and the loss is plotted against
    the learning rate on a log axis. The weights captured on entry are then
    restored and the model is compiled with a fixed learning rate of 8e-7
    and the "mae" metric.

    Args:
      model (TF Keras Model) - model to sweep and recompile
      train_set (TF Dataset) - training batches passed to model.fit

    Returns:
      model (TF Keras Model) - the same model, weights reset, compiled
    """
    sweep_epochs = 100
    base_lr = 1e-8

    def swept_lr(epoch):
        return base_lr * 10**(epoch / 20)

    # Snapshot the weights so the sweep leaves no trace on the model
    starting_weights = model.get_weights()

    scheduler = tf.keras.callbacks.LearningRateScheduler(swept_lr)
    model.compile(loss=tf.keras.losses.Huber(),
                  optimizer=tf.keras.optimizers.SGD(momentum=0.9))
    sweep = model.fit(train_set, epochs=sweep_epochs, callbacks=[scheduler])

    # Learning rate used at each epoch of the sweep
    lrs = base_lr * (10 ** (np.arange(sweep_epochs) / 20))

    plt.figure(figsize=(10, 6))
    plt.grid(True)
    plt.semilogx(lrs, sweep.history["loss"])
    plt.tick_params('both', length=10, width=1, which='both')
    plt.axis([1e-8, 1e-3, 0, 100])

    # Reset Keras state, then put the original weights back
    tf.keras.backend.clear_session()
    model.set_weights(starting_weights)

    # Final training configuration with the fixed learning rate
    final_optimizer = tf.keras.optimizers.SGD(learning_rate=8e-7, momentum=0.9)
    model.compile(loss=tf.keras.losses.Huber(),
                  optimizer=final_optimizer,
                  metrics=["mae"])
    return model

# Train the model

def run_model(model, train_set, epochs=100):
    """Train the model and plot MAE and loss per epoch, full and zoomed.

    The model must have been compiled with the "mae" metric, because the
    training history is read under the 'mae' and 'loss' keys. Two figures are
    shown: the complete curves, and the same curves with roughly the first
    20% of the epochs cut off.

    Args:
      model (TF Keras Model) - compiled model to train
      train_set (TF Dataset) - training batches passed to model.fit
      epochs (int) - number of training epochs

    Returns:
      None
    """
    history = model.fit(train_set, epochs=epochs)

    mae = history.history['mae']
    loss = history.history['loss']
    epoch_axis = range(len(loss))

    # The x-axis carries epoch indices, so it is labelled as such
    plot_series(
        x=epoch_axis,
        y=(mae, loss),
        title='MAE and Loss',
        xlabel='Epochs',
        ylabel='MAE / Loss',
        legend=['MAE', 'Loss']
        )

    # Only plot the last 80% of the epochs
    zoom_split = int(epoch_axis[-1] * 0.2)

    plot_series(
        x=epoch_axis[zoom_split:],
        y=(mae[zoom_split:], loss[zoom_split:]),
        title='MAE and Loss',
        xlabel='Epochs',
        ylabel='MAE / Loss',
        legend=['MAE', 'Loss']
        )

def model_forecast(model, series, window_size, batch_size):
    """Slides a window over the series and returns the model's prediction for each window

    Args:
      model (TF Keras Model) - model that accepts data windows
      series (array of float) - values of the time series
      window_size (int) - number of time steps per window
      batch_size (int) - number of windows per batch

    Returns:
      forecast (numpy array) - predictions for all complete windows
    """

    def as_tensor(window):
        # A window is itself a small dataset; batching it whole yields one tensor
        return window.batch(window_size)

    windows = (
        tf.data.Dataset.from_tensor_slices(series)
        # incomplete windows at the end of the series are dropped
        .window(window_size, shift=1, drop_remainder=True)
        .flat_map(as_tensor)
        .batch(batch_size)
        .prefetch(1)
    )

    return model.predict(windows)

def model_forecast_runner(df, ratio, time_valid, x_valid):
    """Forecast the validation period, print and plot it, and print the MAE

    Reads the module-level `model`, `window_size` and `batch_size`.

    Args:
      df (DataFrame) - full data set with a 'Price' column
      ratio (float) - train fraction used when the data was split
      time_valid (sequence) - x-axis values of the validation period
      x_valid (sequence) - actual values of the validation period

    Returns:
      None
    """
    prices = df['Price']

    # Start window_size steps before the split and stop one step before the
    # end of the series
    first = int((len(df)) * ratio) - window_size
    forecast_input = prices[first:-1]

    # squeeze() drops the single-dimensional axis of the prediction array
    results = model_forecast(model, forecast_input, window_size, batch_size).squeeze()
    print(results)

    plot_series(time_valid, (x_valid, results))

    mae = tf.keras.metrics.mean_absolute_error(x_valid, results)
    print(mae.numpy())

"""### Functions LGBM Model"""

def train_test_split(df, ratio=0.75):
    """Split a feature frame chronologically into train and test parts.

    The columns 'Date', 'Open', 'High', 'Low', 'Vol.' and 'Change %' are
    removed (all of them must be present), 'Price' becomes the target and the
    remaining columns are the features. The input frame is not modified.

    NOTE: the script later imports sklearn's train_test_split, which rebinds
    this name from that point on.

    Args:
      df (DataFrame) - frame holding the columns listed above plus features
      ratio (float) - fraction of rows, counted from the start, used for training

    Returns:
      tuple - (x_train, y_train, x_test, y_test)
    """
    split_point = int(len(df) * ratio)
    features = df.drop(['Date', 'Open', 'High', 'Low', 'Vol.', 'Change %'], axis=1)

    train = features[:split_point]
    test = features[split_point:]

    return (
        train.drop('Price', axis=1),
        train['Price'],
        test.drop('Price', axis=1),
        test['Price'],
    )

"""# Testing the Model"""

# Keras (Conv1D + LSTM) run on the Ethereum price history
df = get_data("Ethereum Historical Data - Investing.com India.csv")

plot_series(x=df.index, y=df['Price'], start=0)

df  # notebook display cell; has no effect when run as a script

time_train, x_train, time_valid, x_valid = split_data_set(df, 0.75)

train_set = get_train_set(x_train)

# Build the network for the same window length that is used to cut the
# forecasting windows (the module-level window_size), not a separate constant
model = build_model(window_size)

model = tune_model(model, train_set)

run_model(model, train_set)

model_forecast_runner(df, 0.75, time_valid, x_valid)


"""### LGBM Model test Run"""

# From here on `model` refers to the LightGBM regressor
model = build_model_lgbm()

# add_features() modifies df in place and returns the same object
df_added = add_features(df)

df_added  # notebook display cell; has no effect when run as a script

x_train, y_train, x_test, y_test = train_test_split(df_added)

x_train  # notebook display cell; has no effect when run as a script

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

# Actual vs predicted price over the test rows
plt.plot(y_test.index, y_test)
plt.plot(y_test.index, y_pred)
plt.show()


"""NLP to analyse the input"""

import nltk
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt

# Word-frequency analysis of a plain-text file with NLTK.

# Download NLTK resources if not already downloaded
nltk.download('punkt')
nltk.download('stopwords')

# Read the text file
# ('your_text_file.txt' is a placeholder path; the file is read as UTF-8)
with open('your_text_file.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Tokenize the text
tokens = word_tokenize(text.lower())  # Convert to lowercase for case-insensitive analysis

# Remove stopwords
# (tokens that are not purely alphanumeric, e.g. punctuation, are dropped as well)
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]

# Calculate word frequencies
freq_dist = FreqDist(filtered_tokens)

# Display the most common words and their frequencies
print("Most common words and their frequencies:")
print(freq_dist.most_common(10))

# Plot the word frequency distribution
# NOTE(review): FreqDist.plot may display the figure itself before the title
# below is applied - confirm against the installed NLTK version
plt.figure(figsize=(10, 5))
freq_dist.plot(30, cumulative=False)
plt.title('Word Frequency Distribution')
plt.show()

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Text classification with TF-IDF features and Multinomial Naive Bayes.

# Assuming you have a DataFrame named 'df' with 'text' and 'label' columns
# 'text' column contains the text data, and 'label' column contains the class labels
# NOTE(review): as written, `df` is still the price DataFrame loaded earlier in
# this script unless the read_csv line below is enabled - confirm the intended input.

# Read or load your data
# df = pd.read_csv('your_data.csv')

# Split the data into training and testing sets
# (train_test_split here is the scikit-learn function imported just above, which
# replaces the function of the same name defined earlier in this file)
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# Create a pipeline with TF-IDF vectorization and Multinomial Naive Bayes
# (this rebinds the module-level name `model`)
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("\nConfusion Matrix:")
print(conf_matrix)
print("\nClassification Report:")
print(class_report)