Skip to content

Commit

Permalink
A few modifications to the exbert code (doesn't work yet)
Browse files Browse the repository at this point in the history
  • Loading branch information
ant-louis committed Apr 14, 2020
1 parent d8d420c commit f5e82f6
Show file tree
Hide file tree
Showing 10 changed files with 59 additions and 15 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
_data/*
extrinsic_evaluation/qa/UnsupervisedQA/data/*
intrinsic_evaluation/data/*

intrinsic_evaluation/exbert/server/data/dev/*

#----------------
# Weird stuff
Expand Down
2 changes: 2 additions & 0 deletions intrinsic_evaluation/exbert/environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ channels:
- anaconda
dependencies:
- connexion=1.5.3
- werkzeug==0.15.6
- flask==1.1.1
- h5py
- spacy
- boto3
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,10 @@ def has_connected_dash(s):
def text2sentences(path):
"""Extract the text from the indicated path"""
with open(path, 'r') as src:
doc = nlp(src.read())

sentences = [sent.string.strip() for sent in doc.sents]
#doc = nlp(src.read())
#sentences = [sent.string.strip() for sent in doc.sents]

sentences = src.readlines()
return sentences

# String -> String
Expand Down
10 changes: 10 additions & 0 deletions intrinsic_evaluation/exbert/server/exbert.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Metadata-Version: 1.0
Name: exbert
Version: 0.0.0
Summary: Vis
Home-page: UNKNOWN
Author: IBM Research AI
Author-email: UNKNOWN
License: UNKNOWN
Description: UNKNOWN
Platform: UNKNOWN
10 changes: 10 additions & 0 deletions intrinsic_evaluation/exbert/server/exbert.egg-info/SOURCES.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
setup.py
exbert.egg-info/PKG-INFO
exbert.egg-info/SOURCES.txt
exbert.egg-info/dependency_links.txt
exbert.egg-info/top_level.txt
pytorch_pretrained_bert/__init__.py
pytorch_pretrained_bert/file_utils.py
pytorch_pretrained_bert/modeling.py
pytorch_pretrained_bert/optimization.py
pytorch_pretrained_bert/tokenization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pytorch_pretrained_bert
17 changes: 9 additions & 8 deletions intrinsic_evaluation/exbert/server/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import connexion
import os
import pickle
import utils.path_fixes as pf
import utils.path_fixes as pf # This file contains all data paths.
import numpy as np

from data.processing.create_faiss import Indexes, ContextIndexes
Expand All @@ -40,10 +40,10 @@ def __init__(self):

def load_info(self):
"""Allow values to have default NONE, load all at once after first load of flask"""
self.embedding_faiss = Indexes(pf.WOZ_EMBEDDINGS)
self.context_faiss = ContextIndexes(pf.WOZ_CONTEXT)
self.embedding_corpus = AttentionCorpusEmbeddings(pf.WOZ_HDF5)
self.context_corpus = AttentionCorpusEmbeddings(pf.WOZ_CONTEXT_HDF5)
self.embedding_faiss = Indexes(pf.CISCO_EMBEDDINGS)
self.context_faiss = ContextIndexes(pf.CISCO_CONTEXT)
self.embedding_corpus = AttentionCorpusEmbeddings(pf.CISCO_HDF5)
self.context_corpus = AttentionCorpusEmbeddings(pf.CISCO_CONTEXT_HDF5)

faiss_loader = FaissLoader()

Expand All @@ -61,12 +61,13 @@ def send_static_client(path):
"""
return send_from_directory(str(pf.CLIENT_DIST), path)


#======================================================================
## INITIALIZATION OF MODEL ##
#======================================================================
bert_version = 'bert-base-uncased'
model = BertModel.from_pretrained(bert_version)
tokenizer = BertTokenizer.from_pretrained(bert_version)
bert_version = 'bert-base-cased'
model = BertModel.from_pretrained(bert_version, cache_dir='/raid/antoloui/Master-thesis/_cache')
tokenizer = BertTokenizer.from_pretrained(bert_version, cache_dir='/raid/antoloui/Master-thesis/_cache')
details_data = AttentionDetailsData(model, tokenizer)

p_file = "_store/simple.pckl"
Expand Down
22 changes: 20 additions & 2 deletions intrinsic_evaluation/exbert/server/utils/path_fixes.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
ROOT_DIR = Path(os.path.abspath(__file__)).parent.parent.parent
WIKI_PATH = DATASET_DIR / 'wikipedia'

# SERVING STATIC FILES
CLIENT_DIST = ROOT_DIR / 'client' / 'dist'

# ==============================================================
# WIZARD OF OZ
# (should I put this into a dictionary? That would make more sense...)
Expand All @@ -27,5 +30,20 @@
WOZ_CONTEXT_HDF5 = WOZ_CONTEXT / 'combined.hdf5'
WOZ_CONTEXT_LAYER_TEMPLATE = WOZ_CONTEXT / LAYER_TEMPLATE

# SERVING STATIC FILES
CLIENT_DIST = ROOT_DIR / 'client' / 'dist'


# ==============================================================
# CISCO
# ==============================================================
CISCO_DIR = DATA_DIR / 'dev'
CISCO_PATH = '/raid/antoloui/Master-thesis/_data/cleaned/dev.raw'

## EMBEDDINGS
CISCO_EMBEDDINGS = CISCO_DIR / 'embeddings'
CISCO_HDF5 = CISCO_EMBEDDINGS / 'combined.hdf5'
CISCO_LAYER_TEMPLATE = CISCO_EMBEDDINGS / LAYER_TEMPLATE

## HEAD INFO
CISCO_CONTEXT = CISCO_DIR / 'headContext'
CISCO_CONTEXT_HDF5 = CISCO_CONTEXT / 'combined.hdf5'
CISCO_CONTEXT_LAYER_TEMPLATE = CISCO_CONTEXT / LAYER_TEMPLATE
2 changes: 1 addition & 1 deletion search/cisco/create_index.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
INDIR=/raid/antoloui/Master-thesis/_data/embeddings/
OUTDIR=/raid/antoloui/Master-thesis/_data/search/cisco/
N_GPU=8
METHOD=ip
METHOD=cos


python -W ignore -u tools/create_faiss_index.py \
Expand Down

0 comments on commit f5e82f6

Please sign in to comment.