import numpy as np
import pandas as pd
import joblib
import wandb
from dotenv import load_dotenv
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, balanced_accuracy_score
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    cross_val_predict,
)
from sklearn.neural_network import MLPClassifier

load_dotenv()
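# Assumption: the .env file loaded above supplies the credentials this script
# needs at runtime (e.g. WANDB_API_KEY for the wandb runs below).
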
#### Import the data
df = pd.read_csv("../data/text_dataset_translate.csv")

# Class distribution before filtering
print(df["diag"].value_counts())
# Drop the rows with an unclear diagnosis (CFTD rows are kept)
df = df[df["diag"] != "UNCLEAR"]
# Keep the surviving row indices so the embedding arrays (X) loaded below can be
# filtered the same way
keep_idx = df.index.to_numpy()

cv_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# Class distribution after filtering
print(df["diag"].value_counts())


LANGUAGE = ["fr", "en"]
EMBEDDING_MODEL = ["instructor", "openai"]

# Labels, taken after filtering
Y = df["diag"].values

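# The same stratified 10-fold split (cv_fold) is reused for both the grid searches
# and the cross_val_predict evaluations below, so every model is tuned and scored
# on identical folds.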
for lang in LANGUAGE:
    for embedding_method in EMBEDDING_MODEL:
        X = np.load(f"../data/embeddings/{embedding_method}_{lang}_embeddings.npy")
        # Subset the embeddings to the filtered rows; only needed if the .npy files
        # were generated from the full CSV, before the UNCLEAR rows were dropped
        if X.shape[0] != len(Y):
            X = X[keep_idx]

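        # Sanity check, assuming each .npy file stores one embedding per CSV row in
        # the original row order (nothing else in the script verifies this)
        assert X.shape[0] == len(Y), "embeddings and labels are misaligned"
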
        #########################################
        # MLPC
        param_grid = {
            "hidden_layer_sizes": [(400,), (200,), (100, 100), (200, 200)],
            "activation": ["tanh", "relu"],
            "solver": ["adam"],
            "learning_rate_init": [0.001, 0.01],
            "max_iter": [800, 1500, 2500],
        }
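
        # 4 * 2 * 1 * 2 * 3 = 48 parameter combinations; with 10-fold CV the grid
        # search below fits 480 MLPs per (embedding, language) pair.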
        # Create grid search
        cls = MLPClassifier(random_state=42)
        gs_mlpc = GridSearchCV(
            cls, param_grid, scoring="accuracy", cv=cv_fold, verbose=1
        )
        gs_mlpc.fit(X, Y)
        best_mlpc = gs_mlpc.best_estimator_
        df_cv_search_mlpc = pd.DataFrame(gs_mlpc.cv_results_)
        # Print the best parameters and score
        print("Best parameters:", gs_mlpc.best_params_)
        print("Best score:", gs_mlpc.best_score_)
        joblib.dump(
            gs_mlpc, f"../models/{embedding_method}_{lang}_gridsearch_mlpc.joblib"
        )
        joblib.dump(best_mlpc, f"../models/{embedding_method}_{lang}_model_mlpc.joblib")

        # Reload the grid search and the best model from disk, so the evaluation
        # below also works when rerun from the saved files
        gs_mlpc = joblib.load(
            f"../models/{embedding_method}_{lang}_gridsearch_mlpc.joblib"
        )
        best_mlpc = joblib.load(
            f"../models/{embedding_method}_{lang}_model_mlpc.joblib"
        )

        # Use cross_val_predict to get predicted labels and probabilities
        y_pred = cross_val_predict(best_mlpc, X, Y, cv=cv_fold)
        y_probas = cross_val_predict(
            best_mlpc, X, Y, cv=cv_fold, method="predict_proba"
        )
        # Compute classification report
        report = classification_report(
            Y, y_pred, target_names=best_mlpc.classes_, output_dict=True
        )
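        # Note: cross_val_predict clones the estimator and refits it on each training
        # fold, so y_pred / y_probas are out-of-fold predictions made with the tuned
        # hyperparameters, not predictions from the refit-on-all-data best_mlpc.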

        run = wandb.init(
            project="myo-text-classify",
            name=f"{embedding_method}_{lang}_mlpc",
            config={
                "embedding": embedding_method,
                "doc_lang": lang,
                "corpus": "complete_1704023_190reports",
                "model": "MLPClassifier",
            },
        )
        config = wandb.config
        best_params = gs_mlpc.best_params_
        best_score = gs_mlpc.best_score_
        best_std = gs_mlpc.cv_results_["std_test_score"][gs_mlpc.best_index_]
        balanced_accuracy_metric = balanced_accuracy_score(Y, y_pred)

        wandb.log(
            {
                "Classification Report": report,
                "Best Params": best_params,
                "Best Score (gs)": best_score,
                "CV Std Devs (gs)": best_std,
                "Balanced Accuracy": balanced_accuracy_metric,
            }
        )
        wandb.sklearn.plot_confusion_matrix(Y, y_pred, best_mlpc.classes_)
        # The full dataset is passed as both the train and test split here; the
        # headline metrics above come from the out-of-fold CV predictions instead
        wandb.sklearn.plot_classifier(
            best_mlpc,
            X,
            X,
            Y,
            Y,
            y_pred,
            y_probas,
            labels=best_mlpc.classes_,
            model_name=f"{embedding_method}_{lang}_model",
            feature_names=None,
        )
        # Create artifact for best model
        model_artifact = wandb.Artifact(
            f"{embedding_method}_{lang}_model_mlpc", type="model"
        )
        # Add best estimator to artifact
        model_artifact.add_file(
            f"../models/{embedding_method}_{lang}_model_mlpc.joblib"
        )
        # Log artifact to WandB
        wandb.run.log_artifact(model_artifact)
        wandb.finish()

        #############################################
        # RANDOM FOREST
        param_grid_rf = {
            "n_estimators": [10, 50, 100, 200],
            "max_depth": [None, 5, 10, 20],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
            "class_weight": ["balanced", "balanced_subsample"],
        }
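
        # 4 * 4 * 3 * 3 * 2 = 288 parameter combinations; with 10-fold CV the grid
        # search below fits 2880 forests per (embedding, language) pair.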

        # Create grid search
        cls_rf = RandomForestClassifier(random_state=42)
        gs_rf = GridSearchCV(
            cls_rf, param_grid_rf, scoring="accuracy", cv=cv_fold, verbose=1
        )
        gs_rf.fit(X, Y)
        best_rf = gs_rf.best_estimator_
        df_cv_search_rf = pd.DataFrame(gs_rf.cv_results_)
        # Print the best parameters and score
        print("Best parameters:", gs_rf.best_params_)
        print("Best score:", gs_rf.best_score_)
        joblib.dump(gs_rf, f"../models/{embedding_method}_{lang}_gridsearch_rf.joblib")
        joblib.dump(best_rf, f"../models/{embedding_method}_{lang}_model_rf.joblib")

        # Reload the saved grid search and best random forest from disk
        gs_rf = joblib.load(f"../models/{embedding_method}_{lang}_gridsearch_rf.joblib")
        best_rf = joblib.load(f"../models/{embedding_method}_{lang}_model_rf.joblib")

        # Use cross_val_predict to get predicted labels and probabilities
        y_pred = cross_val_predict(best_rf, X, Y, cv=cv_fold)
        y_probas = cross_val_predict(best_rf, X, Y, cv=cv_fold, method="predict_proba")
        # Compute classification report
        report = classification_report(
            Y, y_pred, target_names=best_rf.classes_, output_dict=True
        )

        run = wandb.init(
            project="myo-text-classify",
            name=f"{embedding_method}_{lang}_rf",
            config={
                "embedding": embedding_method,
                "doc_lang": lang,
                "corpus": "complete_1704023_190reports",
                "model": "RandomForest",
            },
        )
        config = wandb.config
        # Log the random-forest grid search results (not the MLP ones)
        best_params = gs_rf.best_params_
        best_score = gs_rf.best_score_
        best_std = gs_rf.cv_results_["std_test_score"][gs_rf.best_index_]
        balanced_accuracy_metric = balanced_accuracy_score(Y, y_pred)

        wandb.log(
            {
                "Classification Report": report,
                "Best Params": best_params,
                "Best Score (gs)": best_score,
                "CV Std Devs (gs)": best_std,
                "Balanced Accuracy": balanced_accuracy_metric,
            }
        )
        wandb.sklearn.plot_confusion_matrix(Y, y_pred, best_rf.classes_)
        wandb.sklearn.plot_classifier(
            best_rf,
            X,
            X,
            Y,
            Y,
            y_pred,
            y_probas,
            labels=best_rf.classes_,
            model_name=f"{embedding_method}_{lang}_model",
            feature_names=None,
        )
        # Create artifact for best model
        model_artifact = wandb.Artifact(
            f"{embedding_method}_{lang}_model_rf", type="model"
        )
        # Add best estimator to artifact
        model_artifact.add_file(f"../models/{embedding_method}_{lang}_model_rf.joblib")
        # Log artifact to WandB
        wandb.run.log_artifact(model_artifact)
        wandb.finish()
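

# A minimal sketch of how a logged model could be pulled back down later; this
# assumes the same W&B project/entity and the artifact names used above:
#
#     import os
#     run = wandb.init(project="myo-text-classify", job_type="inference")
#     art = run.use_artifact("openai_en_model_mlpc:latest", type="model")
#     model = joblib.load(os.path.join(art.download(), "openai_en_model_mlpc.joblib"))
#     run.finish()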