Add Zipf's law report

- Add parameter report for collection.
- Add Zipf's law plot to graph.
louislefevre · Apr 10, 2021 · 118e120 · 118e120
1 parent 5b25a32
commit 118e120
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 15 deletions.
diff --git a/README.md b/README.md
@@ -20,7 +20,7 @@ given query.
 ### Util
 - FileManager.py - Reads and writes to a given file.
 - TextProcessor.py - Performs text preprocessing on a collection or passage.
-- Plotter.py - Plots the term frequency graph.
+- Plotter.py - Generates a term distribution plot, as well as a parameter report for the collection.
 - Math.py - Various mathematical formula functions.
 
 ## How to Run
@@ -64,6 +64,7 @@ The program can be initialised by running *start.py*, which accepts parameters i
 - [matplotlib](https://pypi.org/project/matplotlib/)
 - [nltk](https://pypi.org/project/nltk/)
 - [num2words](https://pypi.org/project/num2words/)
+- [tabulate](https://pypi.org/project/tabulate/)
 - [punkt (nltk module)](http://www.nltk.org/api/nltk.tokenize.html?highlight=punkt)
 - [stopwords (nltk module)](https://www.nltk.org/api/nltk.corpus.html)  
 *NLTK modules are downloaded automatically at runtime*
diff --git a/retrieval/DatasetParser.py b/retrieval/DatasetParser.py
@@ -1,11 +1,13 @@
 import os
+from collections import Counter
 
 from retrieval.data.Dataset import Dataset
 from retrieval.data.InvertedIndex import InvertedIndex
 from retrieval.models.BM25 import BM25
 from retrieval.models.QueryLikelihood import QueryLikelihood
 from retrieval.models.VectorSpace import VectorSpace
 from retrieval.util.FileManager import read_pickle, write_pickle
+from util.Plotter import zipfs
 
 
 class DatasetParser:
@@ -20,7 +22,7 @@ def parse(self, model: str, plot_freq: bool = False, smoothing: str = None,
         index = self._generate_index(index_path, self._passages)
 
         if plot_freq:
-            index.plot()
+            self._zipfs_law(index.counter)
         if model == 'bm25':
             model = BM25(index, self._mapping)
         elif model == 'vs':
@@ -33,6 +35,10 @@ def parse(self, model: str, plot_freq: bool = False, smoothing: str = None,
         print("Ranking queries against passages...")
         return {qid: model.rank(qid, query) for qid, query in self._queries.items()}
 
+    @staticmethod
+    def _zipfs_law(counter: Counter):
+        zipfs(counter)
+
     @staticmethod
     def _generate_index(file: str, passages: dict[int, str]) -> InvertedIndex:
         if os.path.isfile(file) and not os.stat(file).st_size == 0:

diff --git a/retrieval/data/InvertedIndex.py b/retrieval/data/InvertedIndex.py
@@ -3,7 +3,6 @@
 from dataclasses import dataclass
 
 from retrieval.util.Math import tf_idf
-from retrieval.util.Plotter import plot_frequency
 from retrieval.util.TextProcessor import clean_collection
 
 
@@ -16,9 +15,6 @@ def parse(self):
         self._index_passages()
         self._tfidf_passages()
 
-    def plot(self):
-        plot_frequency(self.counter)
-
     def _index_passages(self):
         for pid, passage in self._collection.items():
             for term in passage:

diff --git a/retrieval/util/Plotter.py b/retrieval/util/Plotter.py
@@ -1,21 +1,47 @@
 from collections import Counter
 
 from matplotlib import pyplot as plt
+from tabulate import tabulate
 
-from retrieval.util.Math import normalise
+from util.FileManager import write_txt
 
 
-def plot_frequency(counter: Counter):
-    frequencies = normalise(counter.values())
-    frequencies.sort(reverse=True)
-    frequencies = frequencies[:100]
-    _generate_figure(frequencies, title="Term Frequency", x_label="Rank",
-                     y_label="Probability", file_name='term-frequencies.png')
+def zipfs(counter: Counter):
+    prob_distribution = []
+    rows = []
+    total_count = sum(counter.values())
+    c = 0.0
 
+    for rank, (word, freq) in enumerate(counter.most_common(100)):
+        rank += 1
+        p = freq / total_count
+        pr = rank * p
+        c += pr
+        prob_distribution.append(p)
+        rows.append([word, freq, rank, "{:.3f}".format(p), "{:.3f}".format(pr)])
 
-def _generate_figure(data, title=None, x_label=None, y_label=None, file_name='figure.png'):
-    plt.plot(data)
+    _plot_distribution(prob_distribution)
+    _report_parameters(rows, c)
+
+
+def _plot_distribution(prob_distribution: list[float]):
+    zipf_distribution = [0.1 / i for i in range(1, 101)]
+    _generate_figure(prob_distribution, zipf_distribution, title="Zipf's Law", x_label="Rank",
+                     y_label="Probability", file_name='zipf-plot.png')
+
+
+def _report_parameters(rows: list[list], c: float):
+    table = tabulate(rows, headers=['Word', 'Freq', 'r', 'Pr', 'r*Pr'])
+    c = round(c / len(rows), 3)
+    data = table + f'\n\nc = {c}'
+    write_txt('zipf-parameters.txt', data)
+
+
+def _generate_figure(*data, title=None, x_label=None, y_label=None, file_name='figure.png'):
+    for d in data:
+        plt.plot(d)
     plt.title(title)
     plt.xlabel(x_label)
     plt.ylabel(y_label)
+    plt.grid()
     plt.savefig(file_name)