Skip to content

Commit

Permalink
fixed some bugs in boolean model, get rankings from boolean model
Browse files Browse the repository at this point in the history
  • Loading branch information
yolanda93 committed May 28, 2016
1 parent a2f9e45 commit 9d3ee23
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 15 deletions.
7 changes: 5 additions & 2 deletions ir_evaluator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import matplotlib.pyplot as plot
import os
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

class IREvaluator(object):
"""description of class"""
Expand Down Expand Up @@ -56,7 +57,7 @@ def evaluate_query(self,ranking,relevants_docs_query,query_id):
precision.append(self.get_precision(true_positives,false_positives))


recalls_levels = np.array([ 0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
recalls_levels = np.array([ 0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1., 1.1 ])

interpolated_precisions = self.interpolate_precisions(recall,precision,recalls_levels)

Expand Down Expand Up @@ -159,7 +160,9 @@ def plot_results(self,recall, precision):
path_save = raw_input("Please, provide the path where the results should be saved \n")
if len(path_save) >0:
if os.path.exists(path_save):
plot.savefig(path_save)
pp = PdfPages(path_save)
plot.savefig(pp, format='pdf')
pp.savefig()
else:
os.makedirs(path_save)

Expand Down
32 changes: 20 additions & 12 deletions ir_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from operator import itemgetter
import abc
import re

import numpy as np
###################################################################################
## @class InformationRetrievalSystem
# @brief This class represents the InformationRetrievalSystem, i.e., basic methods
Expand Down Expand Up @@ -123,14 +123,16 @@ def __init__(self,corpus,queries):
print("\n--------------------------Executing TF IDF information retrieval model--------------------------\n")
self.ranking_query=dict()

query_id=0
if isinstance(queries, list): # launch queries
for q in queries:
print("\n-------------------------->Query = " + q )
self.ranking_function(corpus,q)
self.ranking_function(corpus,q,query_id)
query_id += 1;

else:
print("\n-------------------------->Query = " + queries )
self.ranking_function(corpus,queries)
self.ranking_function(corpus,queries,1)

def create_documents_view(self,corpus):
dictionary,pdocs = self.create_dictionary(corpus)
Expand All @@ -144,15 +146,15 @@ def create_query_view(self,query,dictionary):
vq = dictionary.doc2bow(pq)
return vq

def ranking_function(self,corpus, q):
def ranking_function(self,corpus, q, query_id):
    """Rank every document in *corpus* against query *q* using TF-IDF cosine similarity.

    Builds the TF-IDF document view, loads the bag-of-words corpus
    serialized in 'vsm_docs.mm', scores all documents against the query
    vector, stores the ranking (best match first) in
    self.ranking_query[query_id], and prints each (score, document) pair.

    Args:
        corpus: sequence of raw document strings (indexable by doc id).
        q: query string.
        query_id: integer key under which the ranking is stored, so the
            evaluator can match rankings against relevance judgments.
    """
    tfidf, dictionary = self.create_documents_view(corpus)
    # Recover the corpus serialized by create_documents_view.
    loaded_corpus = corpora.MmCorpus('vsm_docs.mm')
    index = similarities.MatrixSimilarity(loaded_corpus, num_features=len(dictionary))
    vq = self.create_query_view(q, dictionary)
    self.query_weight = tfidf[vq]
    sim = index[self.query_weight]
    # Sort (doc_id, score) pairs by descending similarity score.
    ranking = sorted(enumerate(sim), key=itemgetter(1), reverse=True)
    # Key by query_id (not by the raw query string): the stale duplicate
    # assignment keyed on q was leftover from before this fix and is removed.
    self.ranking_query[query_id] = ranking  # store the ranking of the query in a dict
    for doc, score in ranking:
        print("[ Score = " + "%.3f" % round(score, 3) + "] " + corpus[doc])

Expand All @@ -164,26 +166,29 @@ def __init__(self,corpus,queries):
print("\n--------------------------Executing Boolean information retrieval model--------------------------\n")
self.ranking_query=dict()

query_id=0
if isinstance(queries, list): # launch queries
for q in queries:
print("\n-------------------------->Query = " + q )
or_set,and_set = self.preprocess_query(q)
dict_matches = self.process_operators(corpus,or_set,and_set)
dict_matches = self.process_operators(corpus,or_set,and_set,query_id)
self.print_result(dict_matches)
query_id += 1
else:
print("\n-------------------------->Query = " + queries )
or_set,and_set = self.preprocess_query(queries)
dict_matches = self.process_operators([corpus],or_set,and_set)
dict_matches = self.process_operators(corpus,or_set,and_set,1)
self.print_result(dict_matches)

def process_operators(self,corpus,or_set,and_set):
def process_operators(self,corpus,or_set,and_set,query_id):
    """Evaluate a boolean query split into OR terms and AND terms.

    Runs the boolean ranking function once per OR phrase, then (if any
    AND terms exist) once more with all AND terms joined into a single
    comma-separated query.  Stores the final matches in
    self.ranking_query[query_id] and returns them.

    NOTE(review): dict_matches is rebound on every OR iteration —
    presumably ranking_function accumulates matches across calls; verify
    against its implementation.
    NOTE(review): if both or_set and and_set are empty, dict_matches is
    never assigned and the return raises NameError — confirm callers
    never pass an empty query.
    """
    # Flatten the nested OR set into a single list of phrases.
    or_list = [val for sublist in or_set for val in sublist]
    for or_txt in or_list: # assign score 1 to documents that match with either phrase with or
        dict_matches = self.ranking_function(corpus,or_txt)
    if len(and_set) > 0:
        # Flatten the nested AND set as well.
        and_list = [val for sublist in and_set for val in sublist]
        and_txt= ', '.join(and_list) # treat the and_set as a single query separated by commas
        dict_matches = self.ranking_function(corpus,and_txt)
    # Keep the per-query result so the evaluator can retrieve it by id.
    self.ranking_query[query_id]=dict_matches.items()
    return dict_matches

def create_documents_view(self,corpus):
Expand Down Expand Up @@ -244,8 +249,11 @@ def __init__(self,corpus,queries):

def create_documents_view(self,corpus):
    """Build sublinear TF-weighted document vectors for *corpus*.

    Converts each document to a bag-of-words vector, then applies the
    sublinear term-frequency weighting 1 + log2(count) to every
    (term_id, count) pair.

    Returns:
        tuple: (tf, dictionary) where tf is a list of
        [(term_id, weight), ...] per document and dictionary is the
        gensim Dictionary built from the corpus.
    """
    dictionary, pdocs = self.create_dictionary(corpus)
    bow = self.docs2bows(corpus, dictionary, pdocs)
    # Sublinear TF: raw counts are >= 1, so every weight is >= 1.
    # The stale pre-fix lines that re-read 'vsm_docs.mm' into tf (and the
    # unused loaded_corpus/TfidfModel locals) were diff residue and are removed.
    tf = [[(w[0], 1 + np.log2(w[1])) for w in v] for v in bow]  # TF model
    return tf, dictionary

def create_query_view(self,query,dictionary):
Expand All @@ -258,8 +266,8 @@ def ranking_function(self,corpus, q,query_id):
loaded_corpus = corpora.MmCorpus('vsm_docs.mm')
index = similarities.MatrixSimilarity(loaded_corpus, num_features=len(dictionary))
vq=self.create_query_view(q,dictionary)
self.query_weight = tf[vq]
sim = index[self.query_weight.slice_]
self.query_weight = [(w[0], 1 + np.log2(w[1])) for w in vq]
sim = index[self.query_weight]
ranking = sorted(enumerate(sim), key=itemgetter(1), reverse=True)
self.ranking_query[query_id]=ranking # store the ranking of the query in a dict
for doc, score in ranking:
Expand Down
1 change: 0 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ def create_ir_system(irmodel_choice,corpus,query):
spamreader = csv.reader(csvfile, delimiter=' ', quotechar='|')
relevances=[]
for row in spamreader:
print ', '.join(row)
relevances.append(row)

ir_evaluator.IREvaluator(relevances,ir.ranking_query)
Expand Down

0 comments on commit 9d3ee23

Please sign in to comment.