diff --git a/engine/base_client/search.py b/engine/base_client/search.py index 764c0f17..1ca8cb7b 100644 --- a/engine/base_client/search.py +++ b/engine/base_client/search.py @@ -20,7 +20,7 @@ class BaseSearcher: - _doc_id_counter = itertools.count(1000000) + _doc_id_counter = itertools.count(1000000000) # Start from 1 billion to avoid conflicts MP_CONTEXT = None def __init__(self, host, connection_params, search_params): @@ -73,6 +73,9 @@ def _insert_one(cls, query): # Generate unique doc_id here doc_id = next(cls._doc_id_counter) + + # Debug logging to verify inserts are happening + #print(f"DEBUG: Inserting vector with doc_id={doc_id}") cls.insert_one(str(doc_id), query.vector, query.meta_conditions) end = time.perf_counter() @@ -266,10 +269,19 @@ def worker_function(self, distance, search_one, insert_one, chunk, result_queue, def process_chunk(chunk, search_one, insert_one, insert_fraction): results = [] + insert_count = 0 + search_count = 0 + + #print(f"DEBUG: Processing chunk of {len(chunk)} queries with insert_fraction={insert_fraction}") + for i, query in enumerate(chunk): if random.random() < insert_fraction: result = insert_one(query) + insert_count += 1 else: result = search_one(query) + search_count += 1 results.append(result) + + #print(f"DEBUG: Chunk complete - {search_count} searches, {insert_count} inserts") return results \ No newline at end of file diff --git a/engine/clients/redis/search.py b/engine/clients/redis/search.py index 61f020a0..f993765e 100644 --- a/engine/clients/redis/search.py +++ b/engine/clients/redis/search.py @@ -112,9 +112,39 @@ def insert_one(cls, doc_id: int, vector, meta_conditions): else: vec_param = vector - doc = {"vector": vec_param} - if meta_conditions: - for k, v in meta_conditions.items(): - doc[k] = str(v) + # Process metadata exactly like upload_batch does + meta = meta_conditions if meta_conditions else {} + geopoints = {} + payload = {} + + if meta is not None: + for k, v in meta.items(): + # This is a patch for arxiv-titles dataset where we have a list of "labels", and + # we want to index all of them under the same TAG field (whose separator is ';'). + if k == "labels": + payload[k] = ";".join(v) + if ( + v is not None + and not isinstance(v, dict) + and not isinstance(v, list) + ): + payload[k] = v + # Redis treats geopoints differently and requires putting them as + # a comma-separated string with lat and lon coordinates + from engine.clients.redis.helper import convert_to_redis_coords + geopoints = { + k: ",".join(map(str, convert_to_redis_coords(v["lon"], v["lat"]))) + for k, v in meta.items() + if isinstance(v, dict) + } - cls.client.hset(str(doc_id), mapping=doc) + #print(f"DEBUG: Redis inserting doc_id={doc_id}, vector_size={len(vec_param)} bytes") + cls.client.hset( + str(doc_id), + mapping={ + "vector": vec_param, + **payload, + **geopoints, + }, + ) + #print(f"DEBUG: Redis insert complete for doc_id={doc_id}")