Skip to content

Commit

Permalink
Python integration test (castorini#905)
Browse files Browse the repository at this point in the history
  • Loading branch information
w329li authored and lintool committed Dec 1, 2019
1 parent ea06575 commit c4f3514
Showing 1 changed file with 141 additions and 76 deletions.
217 changes: 141 additions & 76 deletions src/main/python/solr_integration_test.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
import json
import os,sys
import subprocess
import argparse
import requests


'''
This module can run indexing and searching on a given data collection (e.g. robust04, WashingtonPost)
This module can execute index and search on a given data collection (e.g. robust04, WashingtonPost)
It must run under parental directory "anserini" with supported solr path
It must run under parental directory "anserini" with supported solrini package
This module requires 2 parameters (collection name and collection path) as input to extract
further collection information based on "index_options" below, which is in json format.
More details can be found here:
It requires 1 parameter (collection name) as input to identify all other collection information in
the json format "index_options" below.
These instructions can help you understand how it works:
https://github.com/castorini/anserini/blob/master/docs/solrini.md
'''
Expand All @@ -21,32 +25,18 @@
"core18": {
"collection": "WashingtonPostCollection",
"generator": "WapoGenerator",
"input_path":"core18/WashingtonPost.v2/data", ### Modify it based on ur own collection path
##"input_path":"core18/WashingtonPost.v2/data", ### Modify it based on ur own collection path
"thread_num": 8,
"topic_reader":"Trec",
"topic_path":"src/main/resources/topics-and-qrels/topics.core18.txt",
"qrel_path":"src/main/resources/topics-and-qrels/qrels.core18.txt"
},
"core17": {
"collection": "NewYorkTimesCollection",
"generator": "JsoupGenerator"
},
"cw09b": {
"collection": "ClueWeb09Collection",
"generator": "JsoupGenerator"
},
"cw12b": {
"collection": "ClueWeb12Collection",
"generator": "JsoupGenerator"
},
"gov2": {
"collection": "TrecwebCollection",
"generator": "JsoupGenerator"
"qrel_path":"src/main/resources/topics-and-qrels/qrels.core18.txt",
"expected_indexed_doc":595037,
"MAP": "0.2495",
"P30": "0.3567"
},
"robust04": {
"collection": "TrecCollection",
"generator": "JsoupGenerator",
"input_path":"myrobust04/disk45", ### Modify it based on ur own collection path
"thread_num": 8,
"topic_reader":"Trec",
"topic_path":"src/main/resources/topics-and-qrels/topics.robust04.txt",
Expand All @@ -55,92 +45,167 @@
}



class Solr_command:
def __init__(self,arguments):
self.solr_on = "solrini/bin/solr start -c -m 8G"
self.solr_config = "pushd src/main/resources/solr && ./solr.sh ../../../../solrini localhost:9983 && popd"
self.name = arguments[0]

def command_execution(self, command):
# read_file_into_list works with trec_eval() to extract actual MAP, P_30 results from "this_is_temp_file.txt" file
def read_file_into_list(keywords, path="this_is_temp_file.txt"):
    """Return the lines of a text file that contain any of the given keywords.

    Args:
        keywords: Iterable of substrings to look for in each line.
        path: File to scan. Defaults to "this_is_temp_file.txt", the file
            the evaluation step writes its report to.

    Returns:
        A list of matching lines without line terminators, grouped by keyword
        in the order the keywords were given (a line that matches several
        keywords appears once per keyword). The list is empty when the file
        does not exist.
    """
    # A missing file used to leave `content` unbound and fail later with a
    # NameError; report "no matches" instead.
    if not os.path.exists(path):
        return []
    with open(path, "r") as f:
        content = f.read().splitlines()
    return [line for keyword in keywords for line in content if keyword in line]


class SolrClient:
    def __init__(self, name, input_path, skip_index):
        """Store the settings for one collection run.

        Args:
            name: Collection name; also used to build the run-file name.
            input_path: Path of the collection to index.
            skip_index: Flag value from the command line; it is stored as
                given and compared against the string "True" elsewhere.

        Raises:
            ValueError: If `name` or `input_path` is None.
        """
        # `is None` is the identity test; `== None` can be fooled by objects
        # that override __eq__.
        if name is None:
            raise ValueError("Need collection name, but received None")
        if input_path is None:
            raise ValueError("Need path of collection, but received None")
        self.name = name
        self.input_path = input_path
        self.output_path = "run.solr.{}.bm25.topics.{}.txt".format(name, name)
        self.skip_index = skip_index

def command_execution(self, command_path, command):
    """Print a shell command and run it, after checking a prerequisite path.

    Args:
        command_path: File or directory that must exist before the command
            is run (e.g. the executable the command invokes).
        command: Command line passed to the shell.

    Returns:
        The subprocess.CompletedProcess of the command, so callers can
        inspect `returncode` (previously the result was discarded).

    Raises:
        SystemExit: If `command_path` does not exist.
    """
    print(command)
    if not os.path.exists(command_path):
        # Built from separate pieces so the message no longer depends on the
        # indentation of continuation lines inside a single literal.
        sys.exit(
            "default solr path: {} does not exist! Cannot execute the command. "
            "Please check:\n"
            " 1.Solr is installed under correct path\n"
            " 2.This program is run under the correct path".format(command_path)
        )
    return subprocess.run(command, shell=True)


def start_server(self):
    """Restart the local Solr server and run the Solr configuration script.

    Both steps go through command_execution(), which exits the program when
    the given Solr path does not exist.
    """
    print("we turn on solr server")
    solr_on = "solrini/bin/solr restart -c -m 8G"
    # `pushd`/`popd` are bash builtins, but subprocess runs shell commands
    # through /bin/sh, which is not bash on every system. A plain `cd` works
    # in any POSIX shell, and because it only affects the child shell there
    # is nothing to undo afterwards.
    solr_config = "cd src/main/resources/solr && ./solr.sh ../../../../solrini localhost:9983"
    self.command_execution("solrini/bin/solr", solr_on)
    self.command_execution("src/main/resources/solr", solr_config)
    print("Solr server is on")


def stop_server(self):
    """Stop every running Solr instance via the `solr stop -all` command."""
    shutdown_command = "solrini/bin/solr stop -all"
    subprocess.run(shutdown_command, shell=True)

def index(self, name, input_path, thread_num):
    """Create the Solr collection `name` and index the documents into it.

    The ZooKeeper URL passed to the indexer is fixed at localhost:9983.

    If the collection directory already exists, the current document count
    is checked first: when it is correct and `self.skip_index` is the string
    "True", indexing is skipped; otherwise the collection is deleted and
    rebuilt.

    Args:
        name: Collection name (a key of `index_options`).
        input_path: Path of the documents to index.
        thread_num: Number of indexing threads.

    Raises:
        Exception: If the number of indexed documents is not the expected one
            after indexing.
    """
    thread_num = str(thread_num)
    # If this collection already exists, then this path can be found
    collection_location = "solrini/server/solr/" + name + "_shard1_replica_n1"

    if os.path.exists(collection_location):
        if self.check_indexing() and self.skip_index == "True":
            print("We do not overwrite the existing collection,directly jump to Searching step")
            return
        print("We overwrite the existing collection {} ".format(name))
        command0 = "solrini/bin/solr delete -c {}".format(name)
        self.command_execution("solrini/bin/solr", command0)

    print("Create data collection on Solr")
    command1 = "solrini/bin/solr create -n anserini -c {}".format(name)
    self.command_execution("solrini/bin/solr", command1)

    collection, generator = index_options[name]["collection"], index_options[name]["generator"]
    print("\nsolr indexing start for {}, {}, {}, {}, {} \n".format(collection, generator, name, input_path, thread_num))
    # Adjacent literals are joined by the parser, so the command no longer
    # carries the source indentation that backslash continuations inside one
    # literal used to embed.
    command2 = (
        "sh target/appassembler/bin/IndexCollection -collection {} -generator {} -threads {} -input {} "
        "-solr -solr.index {} -solr.zkUrl localhost:9983 "
        "-storePositions -storeDocvectors -storeTransformedDocs"
    ).format(collection, generator, thread_num, input_path, name)
    self.command_execution("target/appassembler/bin/IndexCollection", command2)

    # check whether index result is as expected
    if not self.check_indexing():
        raise Exception("Indexing result is not as expected")

# check whether we have the correct number of indexed documents after indexing step
def check_indexing(self):
    """Compare the number of documents Solr reports with the expected count.

    Queries the collection named `self.name` on localhost:8983 and reads
    `response.numFound` from the JSON reply.

    Returns:
        True when the reported count equals
        `index_options[self.name]["expected_indexed_doc"]`, else False.
    """
    # The URL used to hard-code "core18", so every other collection was
    # checked against the wrong index.
    url = "http://localhost:8983/solr/{}/query?q=*:*".format(self.name)
    indexing_result = requests.get(url, auth=('user', 'pass')).json()
    num_found = indexing_result["response"]["numFound"]
    print("Expected indexed number: ", index_options[self.name]["expected_indexed_doc"], "actual indexed number",
          num_found)
    total_indexed_doc = int(num_found)
    return total_indexed_doc == int(index_options[self.name]["expected_indexed_doc"])

def search(self, name, topic_reader, topic_path):
    """Run SearchSolr for the given topics and write the run file.

    The ZooKeeper URL passed to the searcher is fixed at localhost:9983 and
    the results are written to `self.output_path`.

    Args:
        name: Solr collection (index) name.
        topic_reader: Topic reader name passed as `-topicreader`.
        topic_path: Path of the topics file passed as `-topics`.

    Returns:
        The path of the run file (`self.output_path`).
    """
    # Adjacent literals are joined by the parser, so the command no longer
    # carries the source indentation that a backslash continuation inside one
    # literal used to embed.
    command = (
        "sh target/appassembler/bin/SearchSolr -topicreader {} -solr.index {} "
        "-solr.zkUrl localhost:9983 -topics {} -output {}"
    ).format(topic_reader, name, topic_path, self.output_path)
    self.command_execution("target/appassembler/bin/SearchSolr", command)
    print("searching complete")
    return self.output_path



def trec_eval(self, qrel_path, output_path):
    """Evaluate a run file with trec_eval and compare MAP / P_30 to the expected values.

    The command is run once through command_execution() (report on the
    console, exit if the binary is missing) and once with its output
    redirected to "this_is_temp_file.txt", from which the metric lines are
    read.

    Args:
        qrel_path: Path of the relevance judgments file.
        output_path: Path of the run file to evaluate.

    Raises:
        Exception: If trec_eval did not report both metrics, or if a metric
            differs from the expected value stored in `index_options`.
    """
    print("\nWe start trec_eval: \n")
    command = "eval/trec_eval.9.0.4/trec_eval -m map -m P.30 {} {}".format(qrel_path, output_path)
    self.command_execution("eval/trec_eval.9.0.4/trec_eval", command)

    temp_file = "this_is_temp_file.txt"
    save_result = command + " > " + temp_file
    try:
        subprocess.run(save_result, shell=True)
        keywords = ["map", "P_30"]
        res = read_file_into_list(keywords)
    finally:
        # Remove the temp file on every path; it used to be left behind
        # whenever one of the checks below raised.
        if os.path.exists(temp_file):
            os.remove(temp_file)

    # Indexing res[0] / res[1] blindly raised an unhelpful IndexError when
    # trec_eval failed or printed nothing.
    if len(res) < 2:
        raise Exception("trec_eval did not report both map and P_30 for {}".format(output_path))

    MAP, P_30 = res[0].split()[-1], res[1].split()[-1]
    idea_MAP, idea_P_30 = index_options[self.name]["MAP"], index_options[self.name]["P30"]
    if MAP != idea_MAP:
        raise Exception("MAP: {}, which is not equal to expected MAP value {} ".format(MAP, idea_MAP))
    if P_30 != idea_P_30:
        raise Exception("P_30: {}, which is not equal to expected P_30 value {} ".format(P_30, idea_P_30))
    print("MAP and P_30 are as expected ")


# Integrate solr indexing,searching and trec_eval together
def data_collection_process(self, name):
    """Run indexing, searching and evaluation for one collection, in that order.

    Thread count, topic reader, topics path and qrels path are looked up in
    `index_options[name]`; the collection path comes from `self.input_path`.

    Args:
        name: Collection name (a key of `index_options`).
    """
    input_path = self.input_path
    options = index_options[name]
    thread_num = options["thread_num"]
    topic_reader = options["topic_reader"]
    topic_path = options["topic_path"]
    qrel_path = options["qrel_path"]
    print("Input: ", name, input_path, thread_num, topic_reader)

    self.index(name, input_path, thread_num)
    self.search(name, topic_reader, topic_path)
    self.trec_eval(qrel_path, self.output_path)


if __name__ == "__main__":
    # Command-line entry point: server setup, document indexing, data
    # retrieval and evaluation with BM25, in that order.
    parser = argparse.ArgumentParser(description="Solr regression test.")
    parser.add_argument("-collection", type=str,
                        help="Name of collection to be processed by Solr, this input is required!")
    parser.add_argument("-input_path", type=str,
                        help="Path of data collection, this input is required!")
    parser.add_argument("-skip_start_server", type=str,
                        help="If you already start server, input True so we do not need to restart server again")
    parser.add_argument("-skip_index", type=str,
                        help="If collection already exists, input True if you want to skip indexing step")
    parser.add_argument("-solr_off_after_testing", type=str,
                        help="Input True if you want to stop solr server after data processing")
    args = parser.parse_args()

    try:
        solr = SolrClient(args.collection, args.input_path, args.skip_index)
    except Exception as e:
        print(e)
        sys.exit("""Invalid input! Run \"python solr_integration_test.py -h\" for further information""")

    # Flags arrive as strings; only the exact value "True" switches them on.
    if args.skip_start_server != "True":
        solr.start_server()

    exit_code = 0
    try:
        solr.data_collection_process(solr.name)
    except Exception as e:
        # A failed check must fail the run: the error used to be printed and
        # then swallowed, so the script always exited with status 0.
        print(e)
        exit_code = 1
    finally:
        # Runs on every path, including when a step calls sys.exit(), so the
        # server is stopped whenever that was requested.
        if args.solr_off_after_testing == "True":
            print("Turn off Solr server")
            solr.stop_server()

    sys.exit(exit_code)



0 comments on commit c4f3514

Please sign in to comment.