Skip to content

Commit

Permalink
mend
Browse files Browse the repository at this point in the history
  • Loading branch information
jgeysen committed May 21, 2021
1 parent 7f8a2b9 commit 8ed3abd
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 30 deletions.
79 changes: 49 additions & 30 deletions eigen_tech_project/inverted_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,9 @@ def sentences(self) -> List[Tuple[int, str]]:
format) and the corresponding file id.
Example return:
[(1, "First sentence of the first document."), (1, "Second sentence of the first document."), ..., (n, "m-th sentence of the n-th document.")]
[(1, "First sentence of the first document."),
(1, "Second sentence of the first document."),
..., (n, "m-th sentence of the n-th document.")]
Returns:
List: List of tuples containing the file id and the sentences for the files in the path
Expand All @@ -86,7 +88,9 @@ def processed_sentences(self) -> List[Tuple]:
file id.
Example return:
[(1, "First sentence of the first document.", "first sentence first document"), (1, "Second sentence of the first document.", "second sentence second document"), ..., (n, "Last sentence of the last document.", "last sentence last document")]
[(1, "First sentence of the first document.", "first sentence first document"),
(1, "Second sentence of the first document.", "second sentence second document"),
..., (n, "Last sentence of the last document.", "last sentence last document")]
Returns:
List: List of tuples containing the file id, the original sentence and the processed sentence for the files in the path
Expand All @@ -110,9 +114,10 @@ def count_vectorizer(self):

@cached_property
def document_term_matrix(self) -> csr_matrix:
"""Returns sparse matrix (scipy.sparse.csr_matrix) of size (number of
sentences x size of vocabulary), mapping for each sentence the
occurrence of each word in the vocabulary.
"""Returns sparse document-term matrix (scipy.sparse.csr_matrix).
The document-term matrix is of size (number of sentences x size of vocabulary) and maps for each sentence the
occurrence of each vocabulary word.
Returns:
csr_matrix: sparse document-term matrix.
Expand All @@ -121,68 +126,82 @@ def document_term_matrix(self) -> csr_matrix:
return self.count_vectorizer.transform(data)

@cached_property
def vocabulary(self) -> List:
"""Returns the contents of each document in list of tuples.
def vocabulary(self) -> List[str]:
"""Returns list of unique words that occur in the data.
The words in this list are processed, meaning that the lemma representation of the original token is used.
The list is ordered alphabetically and maps 1-1 to the columns of the document-term matrix.
Example return:
[(1, "The contents of the first document"), ..., (n, "The contents of the n-th document.")]
["abandon", "ability", "abroad", ..., "zone"]
Returns:
List: Tupled list containing the id and contents of the documents in path given at InvertedIndex
initialisation.
List: List containing the lemmatized vocabulary.
"""
return self.count_vectorizer.get_feature_names()

@cached_property
def lemma_frequencies(self) -> List:
"""Returns the contents of each document in list of tuples.
"""Returns a list of integers, representing the frequency that each
vocabulary word has in the corpus.
The position of each integer in this list corresponds to a lemma in the
alphabetical vocabulary list (with the same respective position). The integers represent how often
a lemma occurs in the entire corpus (across sentences and files).
Example return:
[(1, "The contents of the first document"), ..., (n, "The contents of the n-th document.")]
[4, 1, 1, ..., 2]
Returns:
List: Tupled list containing the id and contents of the documents in path given at InvertedIndex
initialisation.
List: List of integers, representing word frequency.
"""
return self.document_term_matrix.sum(axis=0).tolist()[0]

@cached_property
def lemma_occurrences(self) -> List[List]:
"""Returns the contents of each document in list of tuples.
def lemma_occurrences(self) -> List[List[int]]:
"""Returns a list of lists, containing sentence ids.
This list has a length equal to the vocabulary size. The position of each sublist in this
list maps directly to a lemma (with the same, alphabetical position) in the vocabulary list.
The sublist here represents a collection of the sentence ids in which a lemma occurs.
These sentence ids correspond to rows in the document-term matrix.
Example return:
[(1, "The contents of the first document"), ..., (n, "The contents of the n-th document.")]
[[1, 5, 29, 84], [1], ..., [128, 356]]
Returns:
List: Tupled list containing the id and contents of the documents in path given at InvertedIndex
initialisation.
List: List of lists, each containing sentence ids mapping the vocabulary to the sentences.
"""
return self.document_term_matrix.transpose().tolil().rows.tolist()

@cached_property
def inverted_index(self) -> List[Tuple[str, int, set]]:
"""Returns the contents of each document in list of tuples.
"""Returns a list of tuples, each containing a lemma, the total
frequency of that lemma in the corpus and a collection of the sentence
ids that lemma occurs in.
A mapping between a lemma and the documents (in this case: sentences) in which that lemma occurs is called an
inverted index.
Example return:
[(1, "The contents of the first document"), ..., (n, "The contents of the n-th document.")]
[("abandon", 4, [1, 5, 29, 84]), ..., ("zone", 2, [128, 356])]
Returns:
List: Tupled list containing the id and contents of the documents in path given at InvertedIndex
initialisation.
List: List of tuples, mapping vocabulary to frequency to sentence ids.
"""
return list(
zip(self.vocabulary, self.lemma_frequencies, self.lemma_occurrences)
)

def mapped_inverted_index(self, save=True):
"""Returns the contents of each document in list of tuples.
Example return:
[(1, "The contents of the first document"), ..., (n, "The contents of the n-th document.")]
def mapped_inverted_index(self, save: bool = False) -> pd.DataFrame:
"""Returns a dataframe mapping the inverted index back to the original
sentences.
Args:
save: Boolean, indicating if one wants to directly save the mapped_inverted_index into a .csv file
in the current directory.
Returns:
List: Tupled list containing the id and contents of the documents in path given at InvertedIndex
initialisation.
pd.DataFrame: Dataframe mapping the inverted index back to the original sentences.
"""
df_input = pd.DataFrame(
self.processed_sentences,
Expand Down
2 changes: 2 additions & 0 deletions eigen_tech_project/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

@contextlib.contextmanager
def no_stdout():
"""Yields a context in which sys.stdout is replaced by an in-memory buffer,
so that code run inside it does not print to the console."""
save_stdout = sys.stdout
sys.stdout = io.BytesIO()
yield
Expand Down

0 comments on commit 8ed3abd

Please sign in to comment.