Skip to content

Commit 9618065

Browse files
new models options
1 parent c4f2025 commit 9618065

11 files changed

+342
-19
lines changed

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,3 +178,10 @@ models/vicuna*
178178
data_old
179179
/data
180180
/db_myocon
181+
.nfs*
182+
*.zip
183+
models/backup
184+
nohup.out
185+
results/*
186+
models/*gridsearch*
187+
models/*_mlpc*

Home.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,9 @@
2323
- **Anonymizer 🕵️**: a simple web-based tool to automatically censor patient histology report PDF.
2424
- **MyoExtract 📝:** a tool to extract metadata from histology reports such as biopsy number, muscle, diagnosis...
2525
- **MyoClassify 🪄:** a tool to automatically predict a diagnosis of congenital myopathy subtype from a histology report using AI (large language models). It can currently predict between: Nemaline Myopathy, Core Myopathy, Centro-nuclear Myopathy, Non Congenital Myopathy (NON-MC).
26-
- **MyoSearch 🔎:** a tool to search for a specific term in a set of histology reports. The tool will return the top 5 reports containing closest to your symptom query from our database of reports..
26+
- **MyoSearch 🔎:** a tool to search for a specific term in a set of histology reports. The tool will return the top 5 reports containing closest to your symptom query from our database of reports.
2727
28+
🚨 DISCLAIMER: If you choose OpenAI instead of private AI in the tool options, some tools will use the [OpenAI API](https://openai.com/). Data will be sent to OpenAI servers. If using an OpenAI model, do not upload private or non-anonymized data. As per their terms of service, [OpenAI does not retain any data (for longer than legally required, click for source) and does not use it for training.](https://openai.com/policies/api-data-usage-policies) However, we do not take any responsibility for any data leak.
2829
## Contact
2930
Creator and Maintainer: [**Corentin Meyer**, 3rd year PhD Student in the CSTB Team, ICube — CNRS — Unistra](https://lambda-science.github.io/) <[email protected]>
3031
The source code for NLMyo is available [HERE](https://github.com/lambda-science/NLMyo)

models/instructor_en_model_rf.joblib

208 KB
Binary file not shown.

models/instructor_fr_model_rf.joblib

301 KB
Binary file not shown.

models/instructor_model.joblib

-3.55 MB
Binary file not shown.

models/openai_en_model_rf.joblib

402 KB
Binary file not shown.

models/openai_fr_model_rf.joblib

240 KB
Binary file not shown.

pages/2_📝_MyoExtract.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,7 @@ def st_analyze_pdf(uploaded_file, lang):
9494
st.markdown(
9595
"""
9696
### MyoExtract 📝 is a simple web-based tool to automatically extract common metadata from patient histology report PDF to a JSON format.
97-
Upload a single PDF file or copy paste your text-report and the tool will automatically find for your all: complete name, age, birth date, biopsy date, biopsy sending date, muscle, biopsy number, diagnosis, presence of anomaly in PAS staining, presence of anomaly in Soudan Staining, presence of anomaly in COX staining, presence of anomaly in ATP staining, presence of anomaly in Phosrylase staining.
98-
99-
🚨 DISCLAIMER: If you choose OpenAI on the left, this tool will use [OpenAI API](https://openai.com/). Data will be sent to OpenAI servers. If using OpenAI Model, do not upload private or non-anonymized data. As per their terms of service [OpenAI does not retain any data (for more time than legal requirements, click for source) and do not use them for trainning.](https://openai.com/policies/api-data-usage-policies) However, we do not take any responsibility for any data leak.
100-
Creator and Maintainer: [**Corentin Meyer**, 3rd year PhD Student in the CSTB Team, ICube — CNRS — Unistra](https://lambda-science.github.io/) <[email protected]>
97+
Upload a single PDF file or copy-paste your text report and the tool will automatically find all of the following for you: complete name, age, birth date, biopsy date, biopsy sending date, muscle, biopsy number, diagnosis, presence of anomaly in PAS staining, presence of anomaly in Sudan staining, presence of anomaly in COX staining, presence of anomaly in ATP staining, presence of anomaly in Phosphorylase staining.
10198
"""
10299
)
103100

pages/3_🪄_MyoClassify.py

Lines changed: 69 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,12 @@
44
import numpy as np
55
import sys
66
from streamlit.components.v1 import html
7-
from langchain.embeddings import HuggingFaceInstructEmbeddings
7+
from langchain.embeddings import HuggingFaceInstructEmbeddings, OpenAIEmbeddings
8+
from dotenv import load_dotenv
9+
import openai
10+
11+
load_dotenv()
12+
openai.api_key = os.getenv("OPENAI_API_KEY")
813

914
sys.path.append("../")
1015
from src import TextReport
@@ -37,6 +42,12 @@ def embed_text(text):
3742
return results
3843

3944

45+
@st.cache_data()
46+
def embed_text_openai(text):
47+
results = openai.Embedding.create(model="text-embedding-ada-002", input=text)
48+
return results
49+
50+
4051
@st.cache_data()
4152
def st_analyze_pdf(uploaded_file, lang):
4253
pdf_object = TextReport(uploaded_file, lang=lang)
@@ -47,9 +58,12 @@ def st_analyze_pdf(uploaded_file, lang):
4758
with st.sidebar:
4859
st.write("Report Language")
4960
lang = st.selectbox("Select Language", ("fra", "eng"))
61+
mode = st.selectbox("Select Mode", ("Instructor", "openAI"))
5062

51-
loaded_model = joblib.load("models/instructor_model.joblib")
52-
label_dict = {i: label for i, label in enumerate(loaded_model.classes_)}
63+
loaded_model_instructor_fr = joblib.load("models/instructor_fr_model_rf.joblib")
64+
loaded_model_instructor_en = joblib.load("models/instructor_en_model_rf.joblib")
65+
loaded_model_openai_fr = joblib.load("models/openai_fr_model_rf.joblib")
66+
loaded_model_openai_en = joblib.load("models/openai_en_model_rf.joblib")
5367

5468

5569
st.write("# MyoClassify🪄")
@@ -88,12 +102,60 @@ def st_analyze_pdf(uploaded_file, lang):
88102
st.write("## Raw text")
89103
st.write(raw_text)
90104
st.markdown("# Most probable diagnosis")
91-
results = embed_text(input_text)
92-
embedding = np.array(results)
93-
prediction = loaded_model.predict(embedding.reshape(1, -1))
105+
106+
if lang == "fra":
107+
if mode == "Instructor":
108+
results = embed_text(input_text)
109+
embedding_features = np.array(results)
110+
prediction = loaded_model_instructor_fr.predict(
111+
embedding_features.reshape(1, -1)
112+
)
113+
confidence = loaded_model_instructor_fr.predict_proba(
114+
embedding_features.reshape(1, -1)
115+
)
116+
label_dict = {
117+
i: label for i, label in enumerate(loaded_model_instructor_fr.classes_)
118+
}
119+
elif mode == "openAI":
120+
results = embed_text_openai(input_text)
121+
embedding_features = np.array(results["data"][0]["embedding"])
122+
prediction = loaded_model_openai_fr.predict(
123+
embedding_features.reshape(1, -1)
124+
)
125+
confidence = loaded_model_openai_fr.predict_proba(
126+
embedding_features.reshape(1, -1)
127+
)
128+
label_dict = {
129+
i: label for i, label in enumerate(loaded_model_openai_fr.classes_)
130+
}
131+
elif lang == "eng":
132+
if mode == "Instructor":
133+
results = embed_text(input_text)
134+
embedding_features = np.array(results)
135+
prediction = loaded_model_instructor_en.predict(
136+
embedding_features.reshape(1, -1)
137+
)
138+
confidence = loaded_model_instructor_en.predict_proba(
139+
embedding_features.reshape(1, -1)
140+
)
141+
label_dict = {
142+
i: label for i, label in enumerate(loaded_model_instructor_en.classes_)
143+
}
144+
elif mode == "openAI":
145+
results = embed_text_openai(input_text)
146+
embedding_features = np.array(results["data"][0]["embedding"])
147+
prediction = loaded_model_openai_en.predict(
148+
embedding_features.reshape(1, -1)
149+
)
150+
confidence = loaded_model_openai_en.predict_proba(
151+
embedding_features.reshape(1, -1)
152+
)
153+
label_dict = {
154+
i: label for i, label in enumerate(loaded_model_openai_en.classes_)
155+
}
156+
94157
st.write("Prediction: ", prediction[0])
95158
st.markdown("# Probability of each diagnosis")
96-
confidence = loaded_model.predict_proba(embedding.reshape(1, -1))
97159
for index, value in enumerate(confidence[0]):
98160
st.write(f"Confidence score for: {label_dict[index]}: {round(value*100)}% ")
99161

Lines changed: 213 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,213 @@
1+
import numpy as np
2+
import pandas as pd
3+
from sklearn.model_selection import StratifiedKFold
4+
import joblib
5+
from dotenv import load_dotenv
6+
from sklearn.metrics import classification_report, balanced_accuracy_score
7+
from sklearn.model_selection import cross_val_predict
8+
from sklearn.ensemble import RandomForestClassifier
9+
from sklearn.neural_network import MLPClassifier
10+
from sklearn.model_selection import GridSearchCV
11+
import pandas as pd
12+
import wandb
13+
14+
load_dotenv()
15+
16+
#### Import the data
17+
df = pd.read_csv("../data/text_dataset_translate.csv")
18+
Y = df["diag"].values
19+
20+
# Remove CFTD and unclear diagnosis
21+
df["diag"].value_counts()
22+
# Drop the rows with unclear diagnosis
23+
df = df[df["diag"] != "UNCLEAR"]
24+
# Do the same for the X array based on the df index
25+
26+
cv_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
27+
df["diag"].value_counts()
28+
29+
30+
LANGUAGE = ["fr", "en"]
31+
EMBEDDING_MODEL = ["instructor", "openai"]
32+
33+
Y = df["diag"].values
34+
35+
for lang in LANGUAGE:
36+
for embedding_method in EMBEDDING_MODEL:
37+
X = np.load(f"../data/embeddings/{embedding_method}_{lang}_embeddings.npy")
38+
39+
#########################################
40+
# MLPC
41+
param_grid = {
42+
"hidden_layer_sizes": [(400,), (200,), (100, 100), (200, 200)],
43+
"activation": ["tanh", "relu"],
44+
"solver": ["adam"],
45+
"learning_rate_init": [0.001, 0.01],
46+
"max_iter": [800, 1500, 2500],
47+
}
48+
49+
# Create grid search
50+
cls = MLPClassifier(random_state=42)
51+
gs_mlpc = GridSearchCV(
52+
cls, param_grid, scoring="accuracy", cv=cv_fold, verbose=1
53+
)
54+
gs_mlpc.fit(X, Y)
55+
best_mlpc = gs_mlpc.best_estimator_
56+
df_cv_search_rf = pd.DataFrame(gs_mlpc.cv_results_)
57+
# Print the best parameters and score
58+
print("Best parameters:", gs_mlpc.best_params_)
59+
print("Best score:", gs_mlpc.best_score_)
60+
joblib.dump(
61+
gs_mlpc, f"../models/{embedding_method}_{lang}_gridsearch_mlpc.joblib"
62+
)
63+
joblib.dump(best_mlpc, f"../models/{embedding_method}_{lang}_model_mlpc.joblib")
64+
65+
gs_mlpc = joblib.load(
66+
f"../models/{embedding_method}_{lang}_gridsearch_mlpc.joblib"
67+
)
68+
best_mlpc = joblib.load(
69+
f"../models/{embedding_method}_{lang}_model_mlpc.joblib"
70+
)
71+
72+
# Use cross_val_predict to get predicted labels and probabilities
73+
y_pred = cross_val_predict(best_mlpc, X, Y, cv=cv_fold)
74+
y_probas = cross_val_predict(
75+
best_mlpc, X, Y, cv=cv_fold, method="predict_proba"
76+
)
77+
# Compute classification report
78+
report = classification_report(
79+
Y, y_pred, target_names=best_mlpc.classes_, output_dict=True
80+
)
81+
82+
run = wandb.init(
83+
project="myo-text-classify",
84+
name=f"{embedding_method}_{lang}_mlpc",
85+
config={
86+
"embedding": f"{embedding_method}",
87+
"doc_lang": f"{lang}",
88+
"corpus": "complete_1704023_190reports",
89+
"model": "MLPClassifier",
90+
},
91+
)
92+
config = wandb.config
93+
best_params = gs_mlpc.best_params_
94+
best_score = gs_mlpc.best_score_
95+
best_std = gs_mlpc.cv_results_["std_test_score"][gs_mlpc.best_index_]
96+
balanced_accuracy_metric = balanced_accuracy_score(Y, y_pred)
97+
98+
wandb.log(
99+
{
100+
"Classification Report": report,
101+
"Best Params": best_params,
102+
"Best Score (gs)": best_score,
103+
"CV Std Devs (gs)": best_std,
104+
"Balanced Accuracy": balanced_accuracy_metric,
105+
}
106+
)
107+
wandb.sklearn.plot_confusion_matrix(Y, y_pred, best_mlpc.classes_)
108+
wandb.sklearn.plot_classifier(
109+
best_mlpc,
110+
X,
111+
X,
112+
Y,
113+
Y,
114+
y_pred,
115+
y_probas,
116+
labels=best_mlpc.classes_,
117+
model_name=f"{embedding_method}_{lang}_model",
118+
feature_names=None,
119+
)
120+
# Create artifact for best model
121+
model_artifact = wandb.Artifact(
122+
f"{embedding_method}_{lang}_model_mlpc", type="model"
123+
)
124+
# Add best estimator to artifact
125+
model_artifact.add_file(
126+
f"../models/{embedding_method}_{lang}_model_mlpc.joblib"
127+
)
128+
# Log artifact to WandB
129+
wandb.run.log_artifact(model_artifact)
130+
wandb.finish()
131+
132+
#############################################
133+
# RANDOM FOREST
134+
param_grid_rf = {
135+
"n_estimators": [10, 50, 100, 200],
136+
"max_depth": [None, 5, 10, 20],
137+
"min_samples_split": [2, 5, 10],
138+
"min_samples_leaf": [1, 2, 4],
139+
"class_weight": ["balanced", "balanced_subsample"],
140+
}
141+
142+
# Create grid search
143+
cls_rf = RandomForestClassifier(random_state=42)
144+
gs_rf = GridSearchCV(
145+
cls_rf, param_grid_rf, scoring="accuracy", cv=cv_fold, verbose=1
146+
)
147+
gs_rf.fit(X, Y)
148+
best_rf = gs_rf.best_estimator_
149+
df_cv_search_rf = pd.DataFrame(gs_rf.cv_results_)
150+
# Print the best parameters and score
151+
print("Best parameters:", gs_rf.best_params_)
152+
print("Best score:", gs_rf.best_score_)
153+
joblib.dump(gs_rf, f"../models/{embedding_method}_{lang}_gridsearch_rf.joblib")
154+
joblib.dump(best_rf, f"../models/{embedding_method}_{lang}_model_rf.joblib")
155+
156+
gs_rf = joblib.load(f"../models/{embedding_method}_{lang}_gridsearch_rf.joblib")
157+
best_rf = joblib.load(f"../models/{embedding_method}_{lang}_model_rf.joblib")
158+
159+
# Use cross_val_predict to get predicted labels and probabilities
160+
y_pred = cross_val_predict(best_rf, X, Y, cv=cv_fold)
161+
y_probas = cross_val_predict(best_rf, X, Y, cv=cv_fold, method="predict_proba")
162+
# Compute classification report
163+
report = classification_report(
164+
Y, y_pred, target_names=best_rf.classes_, output_dict=True
165+
)
166+
167+
run = wandb.init(
168+
project="myo-text-classify",
169+
name=f"{embedding_method}_{lang}_rf",
170+
config={
171+
"embedding": f"{embedding_method}",
172+
"doc_lang": f"{lang}",
173+
"corpus": "complete_1704023_190reports",
174+
"model": "RandomForest",
175+
},
176+
)
177+
config = wandb.config
178+
best_params = gs_mlpc.best_params_
179+
best_score = gs_mlpc.best_score_
180+
best_std = gs_mlpc.cv_results_["std_test_score"][gs_mlpc.best_index_]
181+
balanced_accuracy_metric = balanced_accuracy_score(Y, y_pred)
182+
183+
wandb.log(
184+
{
185+
"Classification Report": report,
186+
"Best Params": best_params,
187+
"Best Score (gs)": best_score,
188+
"CV Std Devs (gs)": best_std,
189+
"Balanced Accuracy": balanced_accuracy_metric,
190+
}
191+
)
192+
wandb.sklearn.plot_confusion_matrix(Y, y_pred, best_rf.classes_)
193+
wandb.sklearn.plot_classifier(
194+
best_rf,
195+
X,
196+
X,
197+
Y,
198+
Y,
199+
y_pred,
200+
y_probas,
201+
labels=best_rf.classes_,
202+
model_name=f"{embedding_method}_{lang}_model",
203+
feature_names=None,
204+
)
205+
# Create artifact for best model
206+
model_artifact = wandb.Artifact(
207+
f"{embedding_method}_{lang}_model_rf", type="model"
208+
)
209+
# Add best estimator to artifact
210+
model_artifact.add_file(f"../models/{embedding_method}_{lang}_model_rf.joblib")
211+
# Log artifact to WandB
212+
wandb.run.log_artifact(model_artifact)
213+
wandb.finish()

0 commit comments

Comments
 (0)