Skip to content

Commit

Permalink
fixed some bugs in boolean model, get rankings from boolean model
Browse files Browse the repository at this point in the history
  • Loading branch information
yolanda93 committed May 28, 2016
1 parent a2f9e45 commit 9d3ee23
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 15 deletions.
7 changes: 5 additions & 2 deletions ir_evaluator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import matplotlib.pyplot as plot
import os
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

class IREvaluator(object):
"""description of class"""
Expand Down Expand Up @@ -56,7 +57,7 @@ def evaluate_query(self,ranking,relevants_docs_query,query_id):
precision.append(self.get_precision(true_positives,false_positives))


recalls_levels = np.array([ 0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])
recalls_levels = np.array([ 0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1., 1.1 ])

interpolated_precisions = self.interpolate_precisions(recall,precision,recalls_levels)

Expand Down Expand Up @@ -159,7 +160,9 @@ def plot_results(self,recall, precision):
path_save = raw_input("Please, provide the path where the results should be saved \n")
if len(path_save) >0:
if os.path.exists(path_save):
plot.savefig(path_save)
pp = PdfPages(path_save)
plot.savefig(pp, format='pdf')
pp.savefig()
else:
os.makedirs(path_save)

Expand Down
32 changes: 20 additions & 12 deletions ir_system.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from operator import itemgetter
import abc
import re

import numpy as np
###################################################################################
## @class InformationRetrievalSystem
# @brief This class represents the InformationRetrievalSystem, i.e., basic methods
Expand Down Expand Up @@ -123,14 +123,16 @@ def __init__(self,corpus,queries):
print("\n--------------------------Executing TF IDF information retrieval model--------------------------\n")
self.ranking_query=dict()

query_id=0
if isinstance(queries, list): # launch queries
for q in queries:
print("\n-------------------------->Query = " + q )
self.ranking_function(corpus,q)
self.ranking_function(corpus,q,query_id)
query_id += 1;

else:
print("\n-------------------------->Query = " + queries )
self.ranking_function(corpus,queries)
self.ranking_function(corpus,queries,1)

def create_documents_view(self,corpus):
dictionary,pdocs = self.create_dictionary(corpus)
Expand All @@ -144,15 +146,15 @@ def create_query_view(self,query,dictionary):
vq = dictionary.doc2bow(pq)
return vq

def ranking_function(self,corpus, q):
def ranking_function(self,corpus, q, query_id):
    """Rank every document in *corpus* against query *q* using TF-IDF cosine similarity.

    Builds the TF-IDF document view, loads the bag-of-words corpus
    serialized in 'vsm_docs.mm', scores all documents against the query
    vector, stores the ranking (best match first) in
    self.ranking_query[query_id], and prints each (score, document) pair.

    Args:
        corpus: sequence of raw document strings (indexable by doc id).
        q: query string.
        query_id: integer key under which the ranking is stored, so the
            evaluator can match rankings against relevance judgments.
    """
    tfidf, dictionary = self.create_documents_view(corpus)
    # Recover the corpus serialized by create_documents_view.
    loaded_corpus = corpora.MmCorpus('vsm_docs.mm')
    index = similarities.MatrixSimilarity(loaded_corpus, num_features=len(dictionary))
    vq = self.create_query_view(q, dictionary)
    self.query_weight = tfidf[vq]
    sim = index[self.query_weight]
    # Sort (doc_id, score) pairs by descending similarity score.
    ranking = sorted(enumerate(sim), key=itemgetter(1), reverse=True)
    # Key by query_id (not by the raw query string): the stale duplicate
    # assignment keyed on q was leftover from before this fix and is removed.
    self.ranking_query[query_id] = ranking  # store the ranking of the query in a dict
    for doc, score in ranking:
        print("[ Score = " + "%.3f" % round(score, 3) + "] " + corpus[doc])

Expand All @@ -164,26 +166,29 @@ def __init__(self,corpus,queries):
print("\n--------------------------Executing Boolean information retrieval model--------------------------\n")
self.ranking_query=dict()

query_id=0
if isinstance(queries, list): # launch queries
for q in queries:
print("\n-------------------------->Query = " + q )
or_set,and_set = self.preprocess_query(q)
dict_matches = self.process_operators(corpus,or_set,and_set)
dict_matches = self.process_operators(corpus,or_set,and_set,query_id)
self.print_result(dict_matches)
query_id += 1
else:
print("\n-------------------------->Query = " + queries )
or_set,and_set = self.preprocess_query(queries)
dict_matches = self.process_operators([corpus],or_set,and_set)
dict_matches = self.process_operators(corpus,or_set,and_set,1)
self.print_result(dict_matches)

def process_operators(self,corpus,or_set,and_set):
def process_operators(self,corpus,or_set,and_set,query_id):
    """Evaluate a boolean query split into OR terms and AND terms.

    Runs the boolean ranking function once per OR phrase, then (if any
    AND terms exist) once more with all AND terms joined into a single
    comma-separated query.  Stores the final matches in
    self.ranking_query[query_id] and returns them.

    NOTE(review): dict_matches is rebound on every OR iteration —
    presumably ranking_function accumulates matches across calls; verify
    against its implementation.
    NOTE(review): if both or_set and and_set are empty, dict_matches is
    never assigned and the return raises NameError — confirm callers
    never pass an empty query.
    """
    # Flatten the nested OR set into a single list of phrases.
    or_list = [val for sublist in or_set for val in sublist]
    for or_txt in or_list: # assign score 1 to documents that match with either phrase with or
        dict_matches = self.ranking_function(corpus,or_txt)
    if len(and_set) > 0:
        # Flatten the nested AND set as well.
        and_list = [val for sublist in and_set for val in sublist]
        and_txt= ', '.join(and_list) # treat the and_set as a single query separated by commas
        dict_matches = self.ranking_function(corpus,and_txt)
    # Keep the per-query result so the evaluator can retrieve it by id.
    self.ranking_query[query_id]=dict_matches.items()
    return dict_matches

def create_documents_view(self,corpus):
Expand Down Expand Up @@ -244,8 +249,11 @@ def __init__(self,corpus,queries):

def create_documents_view(self,corpus):
    """Build sublinear TF-weighted document vectors for *corpus*.

    Converts each document to a bag-of-words vector, then applies the
    sublinear term-frequency weighting 1 + log2(count) to every
    (term_id, count) pair.

    Returns:
        tuple: (tf, dictionary) where tf is a list of
        [(term_id, weight), ...] per document and dictionary is the
        gensim Dictionary built from the corpus.
    """
    dictionary, pdocs = self.create_dictionary(corpus)
    bow = self.docs2bows(corpus, dictionary, pdocs)
    # Sublinear TF: raw counts are >= 1, so every weight is >= 1.
    # The stale pre-fix lines that re-read 'vsm_docs.mm' into tf (and the
    # unused loaded_corpus/TfidfModel locals) were diff residue and are removed.
    tf = [[(w[0], 1 + np.log2(w[1])) for w in v] for v in bow]  # TF model
    return tf, dictionary

def create_query_view(self,query,dictionary):
Expand All @@ -258,8 +266,8 @@ def ranking_function(self,corpus, q,query_id):
loaded_corpus = corpora.MmCorpus('vsm_docs.mm')
index = similarities.MatrixSimilarity(loaded_corpus, num_features=len(dictionary))
vq=self.create_query_view(q,dictionary)
self.query_weight = tf[vq]
sim = index[self.query_weight.slice_]
self.query_weight = [(w[0], 1 + np.log2(w[1])) for w in vq]
sim = index[self.query_weight]
ranking = sorted(enumerate(sim), key=itemgetter(1), reverse=True)
self.ranking_query[query_id]=ranking # store the ranking of the query in a dict
for doc, score in ranking:
Expand Down
1 change: 0 additions & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ def create_ir_system(irmodel_choice,corpus,query):
spamreader = csv.reader(csvfile, delimiter=' ', quotechar='|')
relevances=[]
for row in spamreader:
print ', '.join(row)
relevances.append(row)

ir_evaluator.IREvaluator(relevances,ir.ranking_query)
Expand Down

0 comments on commit 9d3ee23

Please sign in to comment.