diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0d20b64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+*.pyc
diff --git a/searchify/__init__.py b/searchify/__init__.py
new file mode 100644
index 0000000..10ec2a7
--- /dev/null
+++ b/searchify/__init__.py
@@ -0,0 +1,15 @@
+# Note that much of the time, something that's a singleton (eg: a string) can also be an iterable. Where this isn't the case, it probably should be.
+# In your search field names, don't start with an underscore ('_') as that's reserved.
+# We STRONGLY recommend explicitly declaring your search field names, as it makes the resultant search system more useful to users. In some cases you don't need it, but that's rare.
+# Note that strictly you can have a callable as a django_field directly. In this case, it will be called with a parameter of None to generate the search field name (well, part of it - it only needs to be unique to the class). But don't do this, it's ugly.
+# When auto-generating, we use '.' to separate bits of things where possible, and '__' where we require \w only.
+
+# FIXME: document the query() method added to managers
+# FIXME: make query() do pagination properly, on top of anything Flax chooses to offer us (currently Flax gives us nothing)
+# FIXME: detect and resolve circular cascades
+
+# TODO: make it possible to index an individual model to more than one database. (Probably multiple explicit indexers.)
+# TODO: reverse cascades, so you can put searchable stuff into your Profile model, but have it index stuff from the User. (Also just easier in general, although I can't see how to make it as powerful as normal cascades.)
+# TODO: if you change the django_fields, searchify should "want" to reindex, with suitable options to tell it not to; perhaps a hash of the config (and allow it to be set explicitly for people who want to manage this themselves)
+
+from index import register_indexer, autodiscover, reindex, Indexer, get_searcher
diff --git a/searchify/clients/__init__.py b/searchify/clients/__init__.py
new file mode 100644
index 0000000..d0a56d9
--- /dev/null
+++ b/searchify/clients/__init__.py
@@ -0,0 +1,30 @@
+"""This module contains the clients which talk to search systems.
+
+Currently there is decent support for pyes, and some old and flaky support for
+Xappy and Flax.
+
+"""
+
+from django.conf import settings
+from django.core.exceptions import ImproperlyConfigured
+
+import os.path
+
+
+def import_client(engine):
+    """Import a lib.searchify client."""
+
+    mod = __import__(engine + '_client', globals(), locals(),
+                     fromlist=['Client'], level=1)
+    return mod.Client
+
+
+if hasattr(settings, 'ENABLE_SEARCHIFY') and settings.ENABLE_SEARCHIFY:
+    engine = getattr(settings, "SEARCHIFY_ENGINE", None)
+    if engine is None:
+        raise ImproperlyConfigured('No engine configured for searchify: '
+                                   'specify settings.SEARCHIFY_ENGINE')
+
+    Client = import_client(engine)
+else:
+    Client = import_client('unconfigured')
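As a concrete reference, here is a minimal settings sketch wiring the selection logic above to the pyes client added below; the setting names are the ones this module and the pyes client actually read, while the address and prefix values are illustrative assumptions:

```python
# settings.py (illustrative values)
ENABLE_SEARCHIFY = True      # master switch checked throughout searchify
SEARCHIFY_ENGINE = 'pyes'    # selects searchify/clients/pyes_client.py

# Read by the pyes client (see searchify/clients/pyes_client.py):
PYES_ADDRESS = 'localhost:9200'    # elasticsearch HTTP transport
PYES_PERSONAL_PREFIX = 'alice_'    # optional; namespaces index names per user
```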
+ +""" + +from django.conf import settings +from flax.searchclient import Client, FlaxError + +def ClientFactory(dbname): + personal_prefix = getattr(settings, "FLAX_PERSONAL_PREFIX", "") + return FlaxClient(personal_prefix + dbname, settings.FLAX_BASE_URL) + +class FlaxClient: + Error = FlaxError + def __init__(self, dbname, url): + self.client = Client(url) + self.dbname = dbname + + def create(self, fields, reopen=False, overwrite=False): + # print "making db" + self.client.create_database(self.dbname, reopen=reopen, overwrite=overwrite) + # print "done." + schema = self.schema() + for f,c in fields.items(): + # print "adding field %s (%s)" % (f, c) + c = dict(c) + if c.has_key("freetext") and c['freetext']: + if c['freetext'].has_key("weight"): + c['freetext']['term_frequency_multiplier'] = c['freetext']['weight'] + del c['freetext']['weight'] + schema.add_field(f, c) + + def schema(self): + return self.client.schema(self.dbname) + + def add(self, doc, docid=None): + db = self.client.db(self.dbname) + # print "adding doc", + ret = db.add_document(doc, docid) + # print "done." + return ret + + def search(self, query, query_filter=None, start=0, end=10): + if filter: + return self.client.db(self.dbname).search_structured(query_any=query, filter=query_filter, start_rank=start, end_rank=end) + else: + return self.client.db(self.dbname).search_simple(query, start, end) + + def get_searcher(self): + """ + Return some sort of useful searching object. + """ + return self.client.db(self.dbname) + + def delete(self, uid): + db = self.client.db(self.dbname) + ret = db.delete_document(uid) + return ret + + def flush(self): + db = self.client.db(self.dbname) + db.flush() + + def close(self): + self.flush() + self.client.close() + self.client = None diff --git a/searchify/clients/pyes_client.py b/searchify/clients/pyes_client.py new file mode 100644 index 0000000..047aa2d --- /dev/null +++ b/searchify/clients/pyes_client.py @@ -0,0 +1,405 @@ +"""Client for elasticsearch, using pyes. + +To enable this client in the django config, set SEARCHIFY_ENGINE to 'pyes', and +set ENABLE_SEARCHIFY to True. + +This client uses two settings from the django config: + + - `PYES_ADDRESS` (required, a string): The address to contact to talk to + elasticsearch. Typically, this will be of the form 'hostname:port'; + elasticsearch usually listens on port 9200 for the HTTP transport, or on + port 9500 for the thrift transport. + + - `PYES_PERSONAL_PREFIX` (optional, a string, defaults to ""). A prefix which + will be added to all indexnames used. This can be used to allow multiple + users to use the same elasticsearch cluster without interfering with each + other - this is particularly useful in development environments where you + don't wish to require all users to run an elasticsearch server. + +""" + +import copy +from django.conf import settings +import lib.searchify +import pyes +import pyes.exceptions + +personal_prefix = getattr(settings, "PYES_PERSONAL_PREFIX", "") + +class Client(object): + """Client to talk to the pyes backend. + + This can be used to get an indexer for performing index actions on a + specific index, or to get a searcher for performing search actions (either + on a specific index, or a general searcher which searches all known + indexes). + + This reads the necessary settings from the Django config. + + """ + def __init__(self): + self.conn = pyes.ES(settings.PYES_ADDRESS, timeout=30) + + def indexer(self, indexname): + """Get an indexer for a given index name. 
+ + """ + return IndexerClient(self, indexname) + + def searcher(self, indexname): + """Get a searcher for a given index name. + + """ + return PyesSearchQS(self, indexname) + + def all_indexes(self): + """Return a dict with information on all known indexes. + + (If a personal_prefix was supplied, only indexes undex this prefix are + shown, and the prefix is stripped from them.) + + Returns a dict, keyed by indexname, in which the values are a dict with + the following properties: + + - num_docs: number of docs in the index. + - alias_for: if this indexname is an alias, a list of the indexes it + is an alias for. + + """ + indices = self.conn.get_indices(include_aliases=True) + res = {} + for index, info in indices.iteritems(): + if not index.startswith(personal_prefix): + continue + index = index[len(personal_prefix):] + + newinfo = dict(num_docs = info['num_docs']) + if 'alias_for' in info: + aliases = [] + for alias in info['alias_for']: + if alias.startswith(personal_prefix): + alias = alias[len(personal_prefix):] + aliases.append(alias) + newinfo['alias_for'] = aliases + + res[index] = newinfo + return res + + def get_alias(self, alias): + """Get a list of the indexes pointed to by an alias. + + Returns an empty list if the alias does not exist. + + """ + try: + result = [] + for indexname in self.conn.get_alias(personal_prefix + alias): + if indexname.startswith(personal_prefix): + indexname = indexname[len(personal_prefix):] + result.append(indexname) + return result + except pyes.exceptions.IndexMissingException: + return [] + + def delete_index(self, indexname): + """Delete the named index (or alias). + + If the index is not found, does not raise an error. + + """ + self.conn.delete_index_if_exists(personal_prefix + indexname) + self.conn.set_alias(personal_prefix + indexname, []) + + def set_alias(self, alias, indexname): + """Set an alias to point to an index. + + """ + self.conn.set_alias(personal_prefix + alias, personal_prefix + indexname) + + def flush(self): + """Flush all changes made by the client. + + This forces all bulk updates to be sent to elasticsearch, but doesn't + force a "refresh", so it may take some time after this call for the + updates to become searchable. + + """ + self.conn.flush() + + def close(self): + """Close the client. + + """ + self.flush() + self.conn.connection.close() + self.conn = None + + +class IndexerClient(object): + def __init__(self, client, indexname): + self.client = client + self.indexname = indexname + self.suffix = '' + self._target_name = None + self._set_target_name() + + def set_suffix(self, suffix=''): + """Set a suffix to be appended to the index name for all subsequent + operations. + + This is used during reindexing to direct all updates to a new index. + + """ + self.suffix = suffix + self._set_target_name() + + def _set_target_name(self): + self._target_name = personal_prefix + self.indexname + self.suffix + + def create_index(self, index_settings): + self.client.conn.create_index(self._target_name, index_settings) + + def set_mapping(self, doc_type, fields): + """Create the index, and add settings for a given doc_type, with + specified field configuration. + + """ + try: + self.client.conn.put_mapping(doc_type, dict(properties=fields), self._target_name) + except pyes.exceptions.MapperParsingException, e: + raise ValueError("Could not parse mapping supplied to index %r " + "for type %r: %s" % (self.indexname, doc_type, e)) + + def get_mapping(self, doc_type): + """Get the mapping for a given doc_type. 
+ + """ + try: + mapping = self.client.conn.get_mapping(doc_type, self._target_name) + except pyes.exceptions.ElasticSearchException: + return None + if settings.ES_VERSION < 0.16: + try: + mapping = mapping[self._target_name] + except KeyError: + aliases = self.client.get_alias(self.indexname) + if len(aliases) == 0: + return None + try: + mapping = mapping[aliases[0]] + except KeyError: + return None + + try: + return mapping[doc_type]['properties'] + except KeyError: + return None + + def add(self, doc, doc_type, docid): + """Add a document of the specified doc_type and docid. + + Replaces any existing document of the same doc_type and docid. + + """ + self.client.conn.index(doc, self._target_name, doc_type=doc_type, id=docid, bulk=True) + + def delete(self, doc_type, docid): + """Delete the document of given doc_type and docid. + + Doesn't report an error if the document wasn't found. + + """ + try: + self.client.conn.delete(self._target_name, doc_type=doc_type, id=docid) + except pyes.exceptions.NotFoundException: + pass + + def flush(self): + """Flush all changes made by the client. + + This forces all bulk updates to be sent to elasticsearch, but doesn't + force a "refresh", so it may take some time after this call for the + updates to become searchable. + + """ + self.client.flush() + +class SearchQS(object): + """A simple wrapper around a query and the parameters which will be used + for a search, to allow a search to be built up easily. + + """ + # FIXME - this should really be backend independent, but the execute method + # implemented here isn't yet. + def __init__(self, client, indexname): + self._client = client + self._indexname = personal_prefix + indexname + self._doc_types = set() + + def clone(self): + """Clone method, used when chaining. + + """ + return copy.copy(self) + + def for_type(self, type): + """Return a new SearchQS which searches only for a specific type. + + Multiple types may be specified by passing a sequence instead of a + single string. + + Any previous types searched for by this SearchQS are dropped. + + """ + result = self.clone() + result._doc_types = set() + if isinstance(type, basestring): + result._doc_types.add(type) + else: + for t in type: + result._doc_types.add(type) + return result + + def execute(self, **kwargs): + """Perform the search, and return a result set object. + + """ + raise NotImplementedError("Subclasses should implement this") + +class PyesSearchQS(SearchQS): + """A client for building searches. + + """ + def __init__(self, client, indexname): + super(PyesSearchQS, self).__init__(client, indexname) + self._query = None + self._facets = [] + self.query_params = {} + + def execution_type(self, type): + """Set the query execution type. + + The default is set by elasticsearch, but in the 0.15 release is + query_then_fetch, which runs the query on each shard without first + sharing global statistics. + + An alternative is dfs_query_then_fetch, which first shares globabl + statistics. There are also _and_fetch variants, which return "size" + results from each shard, to cut down on communication. + + """ + result = self.clone() + assert type in ('query_and_fetch', + 'query_then_fetch', + 'dfs_query_and_fetch', + 'dfs_query_then_fetch', + ) + self.query_params['search_type'] = type + return result + + def add_facet(self, facet): + self._facets.append(facet) + + def parse(self, query_string, *args, **kwargs): + """Construct a search query by parsing user input. 
+ + """ + result = self.clone() + result._query = pyes.StringQuery(query_string, *args, **kwargs) + return result + + def flt(self, fields, text, *args, **kwargs): + result = self.clone() + result._query = pyes.FuzzyLikeThisQuery(fields, text, *args, **kwargs) + return result + + def text_query(self, query_string, *args, **kwargs): + """Construct a text search query by parsing user input. + + """ + result = self.clone() + result._query = pyes.TextQuery(query_string, *args, **kwargs) + return result + + def field_parse(self, field, query_string, *args, **kwargs): + """Construct a search in a field, by parsing user input. + + """ + result = self.clone() + q = pyes.FieldQuery() + q.add(field, query_string, *args, **kwargs) + result._query = q + return result + + def custom_score(self, script, *args, **kwargs): + """Apply a custom weight to the query. + + """ + result = self.clone() + result._query = pyes.CustomScoreQuery(result._query, script, + *args, **kwargs) + return result + + def dis_max(self, queries, **kwargs): + result = self.clone() + _queries = [] + for q in queries: + if isinstance(q, PyesSearchQS): + _queries.append(q._query) + else: + _queries.append(q) + result._query = pyes.query.DisMaxQuery(_queries, **kwargs) + return result + + def execute(self, **kwargs): + search = self._query.search(**kwargs) + search.facet.facets = self._facets + response = self._client.conn.search(search, + (self._indexname,), + tuple(sorted(self._doc_types)), + **self.query_params) + return SearchResultSet(response, search) + +class SearchResult(object): + """An individual search result. + + """ + def __init__(self, type, pk, score, hit): + self.type = type + self.pk = pk + self.score = score + self.hit = hit + +class SearchResultSet(object): + def __init__(self, response, search): + self.start_rank = search.start + self.requested_size = search.size + self.response = response + self.search = search + try: + hits = response['hits'] + except KeyError: + hits = {} + self._hits = hits.get('hits', []) + try: + facets = response['facets'] + except KeyError: + facets = {} + self._facets = facets + self.count = hits.get('total', 0) + self.more_matches = (self.count > self.start_rank + self.requested_size) + + def __len__(self): + """Get the number of result items in this result set. + + """ + return len(self._hits) + + @property + def results(self): + for hit in self._hits: + pk = long(hit['_id']) + type = lib.searchify.utils.lookup_model(hit.get('_type')) + if type is None: + raise Exception("Model %s not found" % hit.get('_type')) + yield SearchResult(type, pk, hit.get('_score', 0), hit) diff --git a/searchify/clients/unconfigured_client.py b/searchify/clients/unconfigured_client.py new file mode 100644 index 0000000..4d61131 --- /dev/null +++ b/searchify/clients/unconfigured_client.py @@ -0,0 +1,10 @@ +"""An empty class used to throw helpful exceptions when a search client isn't configured.""" + +from django.core.exceptions import ImproperlyConfigured + +class Client(object): + """Throws exceptions on attempts to use this class.""" + + def __getattr__(self, _): + raise ImproperlyConfigured( + "Search client not configured. Start with settings.ENABLE_SEARCHIFY") \ No newline at end of file diff --git a/searchify/clients/xappy_client.py b/searchify/clients/xappy_client.py new file mode 100644 index 0000000..8d2de0f --- /dev/null +++ b/searchify/clients/xappy_client.py @@ -0,0 +1,102 @@ +"""Searchify client using Xappy as a backend. + +Status: dubious - use with caution. 
+ +""" + +from django.conf import settings + +import xappy + +def ClientFactory(index): + return XappyClient(index) + +class XappyClient(object): + """ + Simple wrapper around Xappy that makes my life a little easier. + """ + Error = Exception + type = "xappy" + + def __init__(self, index): + self.dbname = index + self.index = None + + def get_index(self): + if not self.index: + self.index = xappy.IndexerConnection(os.path.join(getattr(settings, 'XAPPY_DB_ROOT', ''), self.dbname)) + return self.index + + def create(self, fields, reopen=False, overwrite=False): + idx = self.get_index() + for name, config in fields.items(): + # config format: + # + # store (bool, STORE_CONTEXT action) + # sortable (bool, SORTABLE, COLLAPSE, WEIGHT actions) + # type (text, geo, date, float) + # freetext (dict containing INDEX_FREETEXT options weight, language, stop, spell, nopos, allow_field_specific, search_by_default) + # exacttext (alternative to freetext) + # facet (bool, FACET, type text->string, float->float) + # geo (bool) + # image (dict, approach -> imgseek) + # colour / color (bool) + # If no other actions apply, freetext will happen automatically. + + type = config.get('type', 'text') + + processed_action = False + if config.get('store', False): + idx.add_field_action(name, xappy.FieldActions.STORE_CONTENT) + processed_action = True + if config.get('sortable', False): + if type=='float' and config.has_key('ranges'): + idx.add_field_action(name, xappy.FieldActions.SORTABLE, type=type, ranges=config['ranges']) + else: + idx.add_field_action(name, xappy.FieldActions.SORTABLE, type=type) + idx.add_field_action(name, xappy.FieldActions.COLLAPSE) + if type=='float': + idx.add_field_action(name, xappy.FieldActions.WEIGHT) + processed_action = True + if config.get('facet', False): + if type=='float': + t = 'float' + else: + t = 'string' + idx.add_field_action(name, xappy.FieldActions.FACET, type=t) + processed_action = True + if config.get('geo', False): + idx.add_field_action(name, xappy.FieldActions.GEOLOCATION) + processed_action = True + if config.get('image', False): + if config['image'].get('approach')=='imgseek': + idx.add_field_action(name, xappy.FieldActions.IMGSEEK) + processed_action = True + if config.get('colour', False) or config.get('color', False): + idx.add_field_action(name, xappy.FieldActions.COLOUR) + processed_action = True + if config.get('exacttext', False): + idx.add_field_action(name, xappy.FieldActions.INDEX_EXACT) + elif config.get('freetext', False) or not processed_action: + idx.add_field_action(name, xappy.FieldActions.INDEX_FREETEXT, **config.get('freetext', {})) + + def add(self, fielddata, docid): + doc = xappy.UnprocessedDocument() + for name, values in fielddata.items(): + if type(values) in (str, unicode): + values = [values] + for v in values: + doc.fields.append(xappy.Field(name, v)) + doc.id = docid + self.get_index().add(doc) + + def delete(self, docid): + self.get_index().delete(docid) + + def flush(self): + self.get_index().flush() + + def close(self): + self.flush() + self.get_index().close() + self.index = None diff --git a/searchify/hooks.py b/searchify/hooks.py new file mode 100644 index 0000000..4da6841 --- /dev/null +++ b/searchify/hooks.py @@ -0,0 +1,34 @@ +"""Hooks to ensure that the indexer is informed when an indexed instance +changes. 
+ +""" + +from django.db.models.signals import post_save, pre_delete, post_delete + +from index import get_indexer + +def connect_signals(): + post_save.connect(index_hook) + pre_delete.connect(delete_hook) + +def index_hook(sender, **kwargs): + instance = kwargs['instance'] + indexer = get_indexer(instance) + if indexer: + indexer.index_instance(instance) + +def delete_hook(sender, **kwargs): + instance = kwargs['instance'] + indexer = get_indexer(instance) + if indexer: + indexer.delete(instance) + post_delete.connect(post_delete_hook(instance)) + +def post_delete_hook(instance): + def hook(sender, **kwargs): + if kwargs['instance'] == instance: + indexer = get_indexer(instance) + if indexer: + indexer.cascade(instance) + post_delete.disconnect(hook) + return hook diff --git a/searchify/index.py b/searchify/index.py new file mode 100644 index 0000000..241ee09 --- /dev/null +++ b/searchify/index.py @@ -0,0 +1,455 @@ +""" +Searchify works by having a suitable subclass of Searchable available for each model we care about. + +This can be set up by one of: + + - Place a nested class in the model class called `Indexer`. (A single instance + will be created and registered for the model.) + - Call `register_indexer` directly with an instance. + +""" + +import copy +import sys +import time + +from django.db import models +from django.conf import settings + +import search # for make_searcher +from clients import Client +from utils import get_indexer, get_searcher, get_typename_from_object + +client = Client() + +class SearchifyOptions(object): + def __init__(self, indexer=None): + self.indexer = indexer + +# Map search index -> list of models which have an indexer for that index. +_index_models = {} + +def register_indexer(model, indexer): + """Register an indexer on a model. + + This stores the indexer in a property on the model. + """ + + if not hasattr(settings, 'ENABLE_SEARCHIFY') or not settings.ENABLE_SEARCHIFY: + return + + if not hasattr(model, '_searchify'): + model._searchify = SearchifyOptions() + model._searchify.indexer = indexer + indexer.model = model + if indexer.index: + _index_models.setdefault(indexer.index, []).append(model) + + # indexer.managers is a list of attribute names (eg: ['objects']) for managers we want to + # decorate + for manager in indexer.managers: + manager = getattr(model, manager) + manager.query = indexer.make_searcher(manager) + +_ensure_dbs_exist = True +def autodiscover(verbose=None, ensure_dbs_exist=None): + """Automatically register all models with Indexer subclasses. + + Romp through the models, creating instances of their Indexer subclasses as + required. + + This must be called, even if you've registered the indexers yourself, since + it sets up the search databases. + """ + + # if we get called once with ensure_dbs_exist=False, then we don't want + # to ensure them later. This is an incredible hack to avoid having to put + # autodiscovery into urls.py and still make it practical to have a simple + # reindex command. + # + # Note we do it here so that commands that want to can defer model validation; + # if we leave it until after the call to models.get_models() then model + # validation will have happened and we'll have been called from lib.searchify.models. 
+
+_ensure_dbs_exist = True
+def autodiscover(verbose=None, ensure_dbs_exist=None):
+    """Automatically register all models with Indexer subclasses.
+
+    Romp through the models, creating instances of their Indexer subclasses as
+    required.
+
+    This must be called, even if you've registered the indexers yourself, since
+    it sets up the search databases.
+
+    """
+    # If we get called once with ensure_dbs_exist=False, then we don't want
+    # to ensure them later. This is an incredible hack to avoid having to put
+    # autodiscovery into urls.py and still make it practical to have a simple
+    # reindex command.
+    #
+    # Note we do it here so that commands that want to can defer model
+    # validation; if we leave it until after the call to models.get_models()
+    # then model validation will have happened and we'll have been called from
+    # lib.searchify.models.
+    global _ensure_dbs_exist
+    if ensure_dbs_exist is None:
+        ensure_dbs_exist = _ensure_dbs_exist
+    _ensure_dbs_exist = ensure_dbs_exist
+
+    if not hasattr(settings, 'ENABLE_SEARCHIFY') or not settings.ENABLE_SEARCHIFY:
+        return
+
+    for model in models.get_models():
+        if not hasattr(model, '_searchify') and hasattr(model, 'Indexer'):
+            # auto-register
+            if verbose:
+                verbose.write("Auto-creating indexer instance for class %s\n" %
+                              model)
+            register_indexer(model, model.Indexer(model))
+
+    if not ensure_dbs_exist:
+        return
+
+    # Now loop through and ensure each model has a mapping.
+    for index, modellist in _index_models.items():
+        for model in modellist:
+            indexer = get_indexer(model)
+            if indexer.get_current_mapping() is None:
+                print >>sys.stderr, ("Mapping not stored for %r - need to run "
+                                     "reindex command" % indexer.get_typename(model))
+                del _index_models[index]
+                break
+
+def reindex(indices):
+    """Reindex the named indices, or all indices if none are named.
+
+    The index is rebuilt from scratch with a new suffix, and the alias is then
+    changed to point to the new index, so existing searchers should not be
+    disrupted.
+
+    """
+    if not hasattr(settings, 'ENABLE_SEARCHIFY') or not settings.ENABLE_SEARCHIFY:
+        return
+
+    suffix = '_' + hex(int(time.time()))[2:]
+    if not indices:
+        indices = _index_models.keys()
+    for indexname in indices:
+        reindex_index(indexname, suffix)
+
+def reindex_index(indexname, suffix):
+    """Reindex a named index.
+
+    """
+    if not hasattr(settings, 'ENABLE_SEARCHIFY') or not settings.ENABLE_SEARCHIFY:
+        return
+
+    models = _index_models.get(indexname, None)
+    if models is None:
+        raise KeyError("Index %r is not known" % indexname)
+    try:
+        # Get the index-wide settings, merging the engine-specific settings
+        # from each model's indexer and rejecting any conflicts.
+        index_settings = {}
+        def merge_dicts(path, a, b):
+            for (k, v) in b.iteritems():
+                if k not in a:
+                    a[k] = v
+                    continue
+                if isinstance(v, dict):
+                    merge_dicts('%s.%s' % (path, k), a[k], v)
+                    continue
+                if a[k] == v:
+                    continue
+                raise ValueError("Conflicting values in index_settings "
+                                 "(at %s)" % ('%s.%s' % (path, k))[1:])
+        for model in models:
+            indexer = get_indexer(model)
+            merge_dicts('', index_settings, indexer.index_settings)
+
+        created = False
+        for model in models:
+            print "Indexing %s to %s, using suffix %s" % (model, indexname, suffix)
+            indexer = get_indexer(model)
+            try:
+                indexer.client.set_suffix(suffix)
+                if not created:
+                    indexer.client.create_index(index_settings)
+                    created = True
+                indexer.apply_mapping()
+                indexer.index_all(with_cascade=False)
+            finally:
+                indexer.client.set_suffix()
+                indexer.client.flush()
+
+        # Get the old value of the alias.
+        try:
+            old_index = client.get_alias(indexname)[0]
+        except IndexError:
+            old_index = None
+        if old_index == indexname:
+            # Old index wasn't an alias; we have to delete it and then set the
+            # new alias for it.
+            print "Warning: no alias in use, so must delete in-use index"
+            old_index = None
+            client.delete_index(indexname)
+        print "Setting alias to make new index live"
+        client.set_alias(indexname, indexname + suffix)
+    except:
+        try:
+            client.delete_index(indexname + suffix)
+        except Exception:
+            # Ignore any normal exceptions, so we report the original error.
+            pass
+        raise
+    if old_index:
+        print "Removing old index: %s" % old_index
+        client.delete_index(old_index)
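To illustrate the merge above: index_settings from several indexers are merged recursively, and only a genuine conflict raises. The setting names here are hypothetical engine settings:

```python
a = {'analysis': {'analyzer': {'default': 'snowball'}}}
b = {'analysis': {'analyzer': {'default': 'snowball'}}, 'number_of_shards': 3}
merge_dicts('', a, b)
# a == {'analysis': {'analyzer': {'default': 'snowball'}}, 'number_of_shards': 3}
# If b instead carried number_of_shards=5 against an existing 3, this would
# raise ValueError("Conflicting values in index_settings (at number_of_shards)")
```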
+
+class Indexer(object):
+    """Main indexer superclass, controlling search indexing for a model.
+
+    Most of the indexing behaviour is in here (except for the index clients).
+
+    Typically you won't have to do much here, just subclass and set index,
+    fields and perhaps cascades. However you can override any part of the
+    indexing process if needed.
+
+    """
+    # index is the name of the index that this model should be indexed to.
+    # The default is None, meaning that the model will not be indexed.
+    index = None
+
+    # A dictionary of engine-specific index-level settings.
+    index_settings = {}
+
+    # A list of fields to be indexed.
+    fields = []
+
+    # No cascades by default.
+    cascades = []
+
+    # Don't create a searcher by default. (Details still being pondered, and
+    # searchers haven't been ported to the class approach yet.)
+    managers = []
+
+    # Default field configuration, merged into each field's config.
+    defaults = {}
+
+    def __init__(self, model):
+        self.model = model
+        if self.index:
+            self.client = client.indexer(self.index)
+
+    def reindex_on_cascade(self, cascade_from, cascade_to):
+        """Should we reindex cascade_to when we've just reindexed
+        cascade_from?
+
+        Called on the indexer for cascade_to.
+
+        """
+        return True
+
+    def should_be_in_index(self, instance):
+        """Should this instance be in the index, based on its current state?
+
+        (For instance, this might want to return `not instance.deleted`.)
+
+        """
+        return True
+
+    def get_searcher(self):
+        return self.client.get_searcher()
+
+    def index_all(self, with_cascade=True):
+        """Index or reindex all the instances of this model.
+
+        If with_cascade is True, the cascade of instances depending on this
+        instance will also be traversed, to update any search data built from
+        these instances.
+
+        """
+        from django.db import connection
+        for inst in self.model.objects.all():
+            self.index_instance(inst, with_cascade)
+            del inst
+            # Clear the debug query log, which would otherwise grow without
+            # bound when DEBUG is on.
+            connection.queries = []
+        self.client.flush()
+
+    def index_instance(self, instance, with_cascade=True):
+        """Index or reindex an instance.
+
+        If with_cascade is True, the cascade of instances depending on this
+        instance will also be traversed, to update any search data built from
+        these instances.
+
+        """
+        if self.index:
+            if not self.should_be_in_index(instance):
+                self.client.delete(self.get_typename(instance),
+                                   self.get_docid(instance))
+            else:
+                dret = self.get_index_data(instance)
+                if dret is not None:
+                    (doc_type, docid, fielddata) = dret
+                    self.client.add(fielddata, doc_type=doc_type, docid=docid)
+        if with_cascade:
+            self.cascade(instance)
+        if self.index:
+            self.client.flush()
+
+    def cascade(self, instance):
+        """Cascade the index from this instance to others that depend on it.
+
+        This causes index_instance() to be called on each instance that depends
+        on the instance supplied.
+
+        """
+        for descriptor in self.cascades:
+            cascade_inst = None
+            # Find the instance we're being told to cascade the reindex onto.
+            try:
+                if callable(descriptor):
+                    cascade_inst = descriptor(instance)
+                elif isinstance(descriptor, str):
+                    cascade_inst = getattr(instance, descriptor)
+            except:
+                cascade_inst = None
+            # If we found one, check that it's searchable and that it wants to
+            # accept the cascade, and if so reindex it.
+            if cascade_inst:
+                # If it's not an iterable already, make it into one.
+                if not hasattr(cascade_inst, '__iter__'):
+                    cascade_insts = [cascade_inst]
+                else:
+                    cascade_insts = cascade_inst
+                for cascade_inst in cascade_insts:
+                    indexer = get_indexer(cascade_inst)
+                    if indexer and indexer.reindex_on_cascade(instance, cascade_inst):
+                        indexer.index_instance(cascade_inst, with_cascade=False)
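For example, a cascade might be declared like this (hypothetical models): string descriptors name an attribute on the instance, callables compute the dependent objects, and either may yield a single instance or an iterable.

```python
class Profile(models.Model):
    user = models.ForeignKey('auth.User')
    bio = models.TextField()

    class Indexer(searchify.Indexer):
        index = 'people'
        fields = ['bio']
        # When a Profile changes, also reindex its user and the user's groups.
        cascades = ['user', lambda profile: profile.user.groups.all()]
```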
+ + """ + + if self.index: + self.client.delete(self.get_typename(instance), + self.get_docid(instance)) + self.client.flush() + + def get_typename(self, instance): + """Generate a type name for use in the search database. + + Default is in the format '.' + + """ + return get_typename_from_object(instance) + + def get_docid(self, instance): + """Generate a docid for use as a search database identifier. + + Default is in the format ('.', ''). + + """ + return '%s' % (instance.pk, ) + + def get_index_data(self, instance): + """Get the data to be indexed for an instance. + + Given a Django model instance, return a unique identifier and a + dictionary of search fields mapping to lists of data, or None. + + """ + if not self.fields: + return None + + outfields = {} + + for field in self.fields: + (django_field_list, index_fieldname, index_config) = self.get_details(field) + # print "indexing %s (%s)" % (instance, index_fieldname,) + interim_data = map(lambda x: self.get_field_input(instance, x), django_field_list) + # print '>>>' + str(interim_data) + outfields[index_fieldname] = reduce(lambda x,y: list(x) + list(y), interim_data) + + return (self.get_typename(instance), self.get_docid(instance), + outfields) + + + def get_details(self, field): + """ + Return (django_field_list, search field name, config) for a particular search field (which is a string, dict or possibly callable). + + Performs auto-generation of search field names as needed (generally it's not advised not to do this with a plain callable; use a + dictionary, which makes things more clear). + + If the field is a dictionary, it will have: + django_fields list of django fields / callables + field_name single search field to index into + config additional config options pass through to indexing client on init + + If not, then django_fields is a list of it, and field_name is generated from: + field is str letters of field (eg: my_field -> myfield) + field is callable + field(None) + + (Note that all these fields are in self.fields.) + """ + + if type(field) is dict: + django_field_list = field['django_fields'] + index_fieldname = field.get('field_name') + else: + django_field_list = [field] + index_fieldname = None + + if index_fieldname == None: + if isinstance(django_field_list[0], str): + field_specific_name = django_field_list[0] + elif callable(django_field_list[0]): + field_specific_name = django_field_list[0](None) + index_fieldname = filter(lambda x: x.isalpha(), field_specific_name) + + if type(field) is dict: + return (django_field_list, index_fieldname, field.get('config', {})) + else: + return (django_field_list, index_fieldname, {}) + + + def get_field_input(self, instance, django_field): + """ + Given a single Django field descriptor (string or callable), generate a list of data to input to the search field. + + Converters allow Django ORM types to be modified automatically (eg: returning DateTimeField in a useful format). + Currently, converters are embedded here, which isn't helpful. 
+ """ + + # must return an iterable; django_field is str (name) or callable + if isinstance(django_field, str): + #print 'trying as str' + #print '.name = %s' % instance.name + #print 'getattr(,"name") = %s' % getattr(instance, 'name') + val = getattr(instance, django_field) + + def datetime_converter(d): + return unicode(d.date()) + + # FIXME: converters should be on the class, not embedded in this method + converters = { models.DateTimeField: datetime_converter } + field_type = instance._meta.get_field(django_field).__class__ + if val == None: + return [] + if field_type in converters: + val = converters[field_type](val) + else: + val = unicode(val) + return [val] + elif callable(django_field): + return django_field(instance) + else: + return [] + + def get_configuration(self): + """Get the configuration for this indexer, by looking at self.fields. + + """ + fields = {} + for field in self.fields: + config = copy.deepcopy(self.defaults) + (_, search_fieldname, field_config) = self.get_details(field) + config.update(field_config) + fields[search_fieldname] = config + return fields + + def get_current_mapping(self): + """Get the current mapping for this indexer used by the search engine. + + """ + typename = self.get_typename(self.model) + return self.client.get_mapping(typename) + + def apply_mapping(self): + """Apply the configuration for this indexer to the search engine. + + """ + mapping = self.get_configuration() + typename = self.get_typename(self.model) + self.client.set_mapping(typename, mapping) + + def make_searcher(self, manager): + """Make a searcher for the given manager. + + """ + return search.make_searcher(manager, self.model) diff --git a/searchify/management/__init__.py b/searchify/management/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/searchify/management/commands/__init__.py b/searchify/management/commands/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/searchify/management/commands/searchify_reindex.py b/searchify/management/commands/searchify_reindex.py new file mode 100644 index 0000000..3890ea1 --- /dev/null +++ b/searchify/management/commands/searchify_reindex.py @@ -0,0 +1,39 @@ +from django.core.management.base import BaseCommand, CommandError +import lib.searchify +from optparse import make_option + +class Command(BaseCommand): + args = '[ ...]' + help = """Reindex specified index. + +Reindexes all indices if none specified. + +This clears the specified index and rebuilds it from scratch. + +In order to avoid causing search not to return appropriate results during the +reindexing, indexes are actually named with a suffix based on the creation +time, and an alias is set to point to this from the unsuffixed name. This +alias is updated after the indexing completes, and then the old index is +deleted. + +This means that searches will switch over the the new index only after a +successsful reindex. + + """.strip() + + requires_model_validation = False + + def __init__(self): + super(Command, self).__init__() + + def handle(self, *args, **kwargs): + """ + Note that we defer model validation until after we've run autodiscover. + This means we can call with ensure_dbs_exist=False (which is sticky), + so that we don't get an error for not having a mapping in the database + for anything that hasn't been indexed yet. 
+ """ + + lib.searchify.autodiscover(ensure_dbs_exist=False) + self.validate() + lib.searchify.reindex(args) diff --git a/searchify/management/commands/searchify_show.py b/searchify/management/commands/searchify_show.py new file mode 100644 index 0000000..aa525d2 --- /dev/null +++ b/searchify/management/commands/searchify_show.py @@ -0,0 +1,41 @@ +from django.core.management.base import BaseCommand, CommandError +import lib.searchify +from optparse import make_option +import pprint + +class Command(BaseCommand): + args = '[ ...]' + help = """Show searchify configuration for specified indices. + +Shows configuration for all indicies if none specified. + + """.strip() + + def show_config(self, indices, verbose_out): + lib.searchify.autodiscover(verbose=verbose_out, ensure_dbs_exist=False) + index_models = lib.searchify.index._index_models + if not indices: + indices = index_models.keys() + + for indexname in indices: + self.stdout.write("Configuration for index %r\n" % indexname) + for model in index_models[indexname]: + self.stdout.write("From model: %s\n" % model) + + indexer = lib.searchify.utils.get_indexer(model) + for field, config in sorted(indexer.get_configuration().items()): + self.stdout.write(" - %s:\n" % field) + for k, v in sorted(config.items()): + self.stdout.write(" %s: %s\n" % (k , v)) + + if verbose_out: + verbose_out.write("Stored mapping:\n%s\n" % + pprint.pformat(indexer.get_current_mapping())) + self.stdout.write("\n") + + def handle(self, *args, **kwargs): + if kwargs.get('verbosity') == '2': + verbose_out = self.stdout + else: + verbose_out = None + self.show_config(args, verbose_out) diff --git a/searchify/models.py b/searchify/models.py new file mode 100644 index 0000000..e1ea2cf --- /dev/null +++ b/searchify/models.py @@ -0,0 +1,15 @@ +"""Initialisation for the searchify app. + +Despite the name, this file doesn't actually contain any models. It's just used +to contain initialisation for the searchify app, which is called at Django +setup time. + +""" + +from django.conf import settings +from hooks import connect_signals +from index import autodiscover + +if hasattr(settings, 'ENABLE_SEARCHIFY') and settings.ENABLE_SEARCHIFY: + connect_signals() + autodiscover() diff --git a/searchify/search.py b/searchify/search.py new file mode 100644 index 0000000..59579da --- /dev/null +++ b/searchify/search.py @@ -0,0 +1,169 @@ +# search-specific pieces + +def make_searcher(manager, model): + index = get_index(model) + if not index: + return None + + client = get_client(index) + def search(query=None, start=None, end=None, query_filter=None): + if start==None and end==None: + # new-style interface. Return an object which responds to slicing and is iterable. + class SearchResultSet: + def __init__(self, query, query_filter=None): + self.query = query + self.query_filter = query_filter + self.position = None + self.smallest_page = 10 + # the following are filled out as needed + self._results = None # just the current page of results (but contains useful metadata also) + self.results = {} # store of index -> decorated Django instance + + def __repr__(self): + return "" + + def __getattr__(self, key): + # print "__getattr__(%s)" % key + """Provide .-access to aspects of the result. 
+        else:
+            # Old-style interface.
+            if start is None:
+                start = 0
+            if end is None:
+                end = 10
+
+            class QueryResult:
+                def __init__(self, results):
+                    self.results = results
+                    search_ids = [item.docid for item in results.results]
+                    self._result_ids = []
+                    self._deets = {}
+                    for item in results.results:
+                        search_id = item.docid
+                        database_name, model_key, id = search_id.split('.')
+                        if model_key != model.__name__:
+                            # FIXME: not right!
+                            pass
+                        self._result_ids.append(long(id))
+                        self._deets[id] = item
+                    self._bulk = manager.in_bulk(self._result_ids)
+
+                # From Flax, we get: matches_lower_bound, matches_upper_bound,
+                # more_matches, matches_estimated,
+                # matches_human_readable_estimate.
+                def __getattr__(self, key):
+                    """Provide .-access to aspects of the result.
+                    eg: q.doc_count (providing the search provider returns doc_count)."""
+                    return getattr(self.results, key)
+
+                def __len__(self):
+                    return len(self._result_ids)
+
+                def __iter__(self):
+                    """Iterate over the results, in the order they were in the
+                    result set.
+
+                    Yields decorated objects, ie the Django model instances
+                    with an extra attribute (default 'match') containing match
+                    details (you mostly care about .rank, if provided).
+                    """
+                    for key in self._result_ids:
+                        obj = self._bulk[long(key)]
+                        # From Flax we get: data (dict of field->data pairs),
+                        # id, rank. We only really care about rank at this
+                        # stage, as we've pulled out the object.
+                        if hasattr(model.Searchable, 'match_details_attribute'):
+                            match_attr = model.Searchable.match_details_attribute
+                        else:
+                            match_attr = 'match'
+                        if match_attr is not None:
+                            setattr(obj, match_attr, self._deets[str(obj.pk)])
+                        yield obj
+
+            results = client.search(query, query_filter, start, end)
+            return QueryResult(results)
+
+    return search
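As the FIXME in searchify/__init__.py notes, the query() method that make_searcher() attaches to managers is still undocumented; here is a hedged usage sketch inferred from the code above (model and query text are illustrative):

```python
# With managers = ['objects'] on Book.Indexer:
results = Book.objects.query('dune')   # new-style: lazy, iterable, sliceable
for book in results[0:10]:             # fetches only the first page
    print book.title, book.match.rank  # .match carries backend match details

page = Book.objects.query('dune', start=0, end=10)  # old-style: eager page
```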
diff --git a/searchify/utils.py b/searchify/utils.py
new file mode 100644
index 0000000..f8fe2f7
--- /dev/null
+++ b/searchify/utils.py
@@ -0,0 +1,54 @@
+"""Utility functions for searchify.
+
+"""
+
+from django.db import models
+from django.conf import settings
+
+if hasattr(settings, 'ENABLE_SEARCHIFY') and settings.ENABLE_SEARCHIFY:
+    def get_searcher(model_or_instance):
+        """Given a model or instance, find the searcher for it.
+
+        """
+        if not isinstance(model_or_instance, models.base.ModelBase):
+            model = type(model_or_instance)
+        else:
+            model = model_or_instance
+        if not hasattr(model, '_searchify'):
+            return None
+        if not hasattr(model._searchify, 'searcher'):
+            return None
+        return model._searchify.searcher
+
+    def get_indexer(model_or_instance):
+        """Given a model or instance, find the indexer for it.
+
+        """
+        if not isinstance(model_or_instance, models.base.ModelBase):
+            model = type(model_or_instance)
+        else:
+            model = model_or_instance
+        if not hasattr(model, '_searchify'):
+            return None
+        if not hasattr(model._searchify, 'indexer'):
+            return None
+        return model._searchify.indexer
+else:
+    get_indexer = get_searcher = lambda x: None
+
+
+def lookup_model(modeldesc):
+    """Convert a packed type name ('<app_label>.<ModelName>') into a model
+    class.
+
+    """
+    try:
+        (app_label, model_name) = modeldesc.rsplit(".", 1)
+    except ValueError:
+        return None
+    return models.get_model(app_label, model_name)
+
+
+def get_typename_from_object(instance):
+    return '%s.%s' % (
+        instance._meta.app_label, instance._meta.object_name,
+    )