Skip to content

Commit

Permalink
Add Zipf's law report
Browse files Browse the repository at this point in the history
- Add parameter report for collection.
- Add Zipf's law plot to graph.
  • Loading branch information
louislefevre committed Apr 10, 2021
1 parent 5b25a32 commit 118e120
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 15 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ given query.
### Util
- FileManager.py - Reads and writes to a given file.
- TextProcessor.py - Performs text preprocessing on a collection or passage.
- Plotter.py - Plots the term frequency graph.
- Plotter.py - Generates a term distribution plot, as well as a parameter report for the collection.
- Math.py - Various mathematical formula functions.

## How to Run
Expand Down Expand Up @@ -64,6 +64,7 @@ The program can be initialised by running *start.py*, which accepts parameters i
- [matplotlib](https://pypi.org/project/matplotlib/)
- [nltk](https://pypi.org/project/nltk/)
- [num2words](https://pypi.org/project/num2words/)
- [tabulate](https://pypi.org/project/tabulate/)
- [punkt (nltk module)](http://www.nltk.org/api/nltk.tokenize.html?highlight=punkt)
- [stopwords (nltk module)](https://www.nltk.org/api/nltk.corpus.html)
*NLTK modules are downloaded automatically at runtime*
8 changes: 7 additions & 1 deletion retrieval/DatasetParser.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import os
from collections import Counter

from retrieval.data.Dataset import Dataset
from retrieval.data.InvertedIndex import InvertedIndex
from retrieval.models.BM25 import BM25
from retrieval.models.QueryLikelihood import QueryLikelihood
from retrieval.models.VectorSpace import VectorSpace
from retrieval.util.FileManager import read_pickle, write_pickle
from util.Plotter import zipfs


class DatasetParser:
Expand All @@ -20,7 +22,7 @@ def parse(self, model: str, plot_freq: bool = False, smoothing: str = None,
index = self._generate_index(index_path, self._passages)

if plot_freq:
index.plot()
self._zipfs_law(index.counter)
if model == 'bm25':
model = BM25(index, self._mapping)
elif model == 'vs':
Expand All @@ -33,6 +35,10 @@ def parse(self, model: str, plot_freq: bool = False, smoothing: str = None,
print("Ranking queries against passages...")
return {qid: model.rank(qid, query) for qid, query in self._queries.items()}

@staticmethod
def _zipfs_law(counter: Counter) -> None:
    """Forward the collection's term counter to ``zipfs``.

    ``zipfs`` produces the Zipf's law plot and the parameter report; this
    wrapper adds no logic of its own.

    NOTE(review): ``zipfs`` is imported from ``util.Plotter`` while the other
    project imports in this module use the ``retrieval.`` package prefix --
    confirm that the import path resolves from the program's entry point.
    """
    zipfs(counter)

@staticmethod
def _generate_index(file: str, passages: dict[int, str]) -> InvertedIndex:
if os.path.isfile(file) and not os.stat(file).st_size == 0:
Expand Down
4 changes: 0 additions & 4 deletions retrieval/data/InvertedIndex.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from dataclasses import dataclass

from retrieval.util.Math import tf_idf
from retrieval.util.Plotter import plot_frequency
from retrieval.util.TextProcessor import clean_collection


Expand All @@ -16,9 +15,6 @@ def parse(self):
self._index_passages()
self._tfidf_passages()

def plot(self) -> None:
    """Pass this index's ``counter`` to ``plot_frequency``."""
    plot_frequency(self.counter)

def _index_passages(self):
for pid, passage in self._collection.items():
for term in passage:
Expand Down
44 changes: 35 additions & 9 deletions retrieval/util/Plotter.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,47 @@
from collections import Counter

from matplotlib import pyplot as plt
from tabulate import tabulate

from retrieval.util.Math import normalise
from util.FileManager import write_txt


def plot_frequency(counter: Counter):
    """Plot the 100 largest normalised term frequencies, highest first.

    The values of ``counter`` are passed through ``normalise``, ordered from
    largest to smallest, cut down to the first 100 entries and handed to
    ``_generate_figure`` with the output file name ``term-frequencies.png``.
    """
    ranked = normalise(counter.values())
    ranked.sort(reverse=True)
    _generate_figure(
        ranked[:100],
        title="Term Frequency",
        x_label="Rank",
        y_label="Probability",
        file_name='term-frequencies.png',
    )
def zipfs(counter: Counter):
    """Compare the collection's term distribution against Zipf's law.

    For each of the 100 most common terms the 1-based rank ``r``, the
    probability ``Pr`` (the term's count divided by the total of all counts)
    and the product ``r * Pr`` are computed. The probabilities are passed to
    ``_plot_distribution``; the per-term rows, together with the sum of the
    ``r * Pr`` products, are passed to ``_report_parameters``.

    Nothing is plotted or reported when the counts sum to zero, because every
    probability would be undefined.

    Args:
        counter: Mapping of term to its occurrence count in the collection.
    """
    total_count = sum(counter.values())
    if not total_count:
        # No occurrences at all: dividing by the total (or averaging over
        # zero rows later on) would raise ZeroDivisionError.
        return

    prob_distribution = []
    rows = []
    c = 0.0

    # start=1 so the most common term gets rank 1 rather than 0.
    for rank, (word, freq) in enumerate(counter.most_common(100), start=1):
        p = freq / total_count
        pr = rank * p
        c += pr
        prob_distribution.append(p)
        rows.append([word, freq, rank, f"{p:.3f}", f"{pr:.3f}"])

    _plot_distribution(prob_distribution)
    _report_parameters(rows, c)


def _plot_distribution(prob_distribution: list[float]):
    """Plot observed term probabilities next to a Zipf reference curve.

    The reference curve is ``0.1 / rank`` for ranks starting at 1
    (presumably the usual Zipf constant c ~= 0.1 -- confirm). It covers
    exactly as many ranks as ``prob_distribution`` so both lines end at the
    same rank, including when fewer than 100 terms are supplied.

    Args:
        prob_distribution: Observed probabilities ordered by rank, most
            probable term first.
    """
    rank_count = len(prob_distribution)
    zipf_distribution = [0.1 / rank for rank in range(1, rank_count + 1)]
    _generate_figure(prob_distribution, zipf_distribution, title="Zipf's Law", x_label="Rank",
                     y_label="Probability", file_name='zipf-plot.png')


def _report_parameters(rows: list[list], c: float):
    """Write the per-term table and the mean ``r * Pr`` to a text file.

    The rows are rendered with ``tabulate`` under the headers
    ``Word, Freq, r, Pr, r*Pr``; below the table a line ``c = <value>`` holds
    ``c`` divided by the number of rows, rounded to three decimals. The text
    is written to ``zipf-parameters.txt`` through ``write_txt``.

    Args:
        rows: One entry per reported term, in the column order of the headers.
        c: Sum of the ``r * Pr`` products over all rows.
    """
    table = tabulate(rows, headers=['Word', 'Freq', 'r', 'Pr', 'r*Pr'])
    # With no rows there is nothing to average over; report "nan" rather than
    # failing with ZeroDivisionError.
    mean_c = round(c / len(rows), 3) if rows else float('nan')
    data = table + f'\n\nc = {mean_c}'
    write_txt('zipf-parameters.txt', data)


def _generate_figure(*data, title=None, x_label=None, y_label=None, file_name='figure.png'):
    """Draw every series in ``data`` as a line on one figure and save it.

    Args:
        *data: One or more sequences of y-values; each becomes its own line.
        title: Text shown above the plot.
        x_label: Label for the x-axis.
        y_label: Label for the y-axis.
        file_name: Destination passed to ``savefig``.
    """
    # pyplot keeps a single implicit "current" figure alive between calls.
    # Drawing on a dedicated figure prevents lines from an earlier call from
    # showing up in this one.
    fig, ax = plt.subplots()
    try:
        for series in data:
            ax.plot(series)
        ax.set_title(title)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        # Explicit True: an argument-less grid() call toggles the grid, which
        # is only correct on axes that have never been touched.
        ax.grid(True)
        fig.savefig(file_name)
    finally:
        # Figures stay registered with pyplot until closed explicitly.
        plt.close(fig)

0 comments on commit 118e120

Please sign in to comment.