Support full text search for all languages using pgroonga.

This adds support for using PGroonga to back the Zulip full-text search feature. Because built-in PostgreSQL full text search doesn't support languages that don't put space between terms such as Japanese, Chinese and so on. PGroonga supports all languages including Japanese and Chinese. Developers will need to re-provision when rebasing past this patch for the tests to pass, since provision is what installs the PGroonga package and extension. PGroonga is enabled by default in development but not in production; the hope is that after the PGroonga support is tested further, we can enable it by default. Fixes zulip#615. [docs and tests tweaked by tabbott]
JimGomez48 · Aug 27, 2016 · 683f49a · 683f49a
1 parent 51f35f2
commit 683f49a
Show file tree

Hide file tree

Showing 11 changed files with 285 additions and 12 deletions.
diff --git a/docs/full-text-search.md b/docs/full-text-search.md
@@ -34,6 +34,74 @@ application server instead.
 
 ## An optional full-text search implementation
 
-See [the option PGroonga pull
-request](https://github.com/zulip/zulip/pull/700) for details on the
-status of the PGroonga integration.
+Zulip now supports using [PGroonga](http://pgroonga.github.io/) for
+full-text search. PGroonga is a PostgreSQL extension that provides
+full-text search feature. PostgreSQL's built-in full-text search
+feature supports only one language at a time (in Zulip's case,
+English).  PGroonga supports all languages simultaneously, including
+Japanese, Chinese and so on, all at once.  We expect to migrate
+Zulip's full-text search to only support PGroonga once we have tested
+this new extension fully.
+
+The following processes should be executed as the root user. Run:
+
+    sudo -i
+
+### How to enable full-text search against all languages
+
+This section describes how to enable using PGroonga to back the
+full-text search feature.
+
+* To install PGroonga, add `pgroonga = enabled` in the `[machine]`
+section in `/etc/zulip/zulip.conf`:
+
+    [machine]
+    ...
+    pgroonga = enabled
+
+And then run as root:
+
+    /home/zulip/deployments/current/scripts/zulip-puppet-apply
+
+Then, add `USING_PGROONGA = true` in `/etc/zulip/settings.py`:
+
+    USING_PGROONGA = True
+
+And apply the PGroonga migrations:
+
+    cd /srv/zulip
+    ./manage.py migrate pgroonga
+
+Note that the migration may take a long time, and you can't send new
+messages until the migration is finished.
+
+Once the migrations are complete, restart Zulip:
+
+    su zulip -c /home/zulip/deployments/current/scripts/restart-server
+
+Now, you can use full-text search against all languages.
+
+### How to disable full-text search against all languages
+
+This section describes how to disable full-text search feature based
+on PGroonga.
+
+If you want to fully remove PGroonga, first you need to remove the
+PGroonga column (as above, this will take a long time and no messages
+can be sent while it is running).  If you intend to re-enable PGroonga
+later, you can skip this step (at the cost of your Message table being
+slightly larger than it would be otherwise).
+
+    /home/zulip/deployments/current/manage.py migrate pgroonga zero
+
+Then, set `USING_PGROONGA = False` in `/etc/zulip/settings.py`:
+
+    USING_PGROONGA = False
+
+And, restart Zulip:
+
+    su zulip -c /home/zulip/deployments/current/scripts/restart-server
+
+Now, full-text search feature based on PGroonga is disabled.  If you'd
+like, you can also remove the `pgroonga = enabled` line in
+`/etc/zulip/zulip.conf` and uninstall the `pgroonga` packages.
diff --git a/pgroonga/__init__.py b/pgroonga/__init__.py
diff --git a/pgroonga/migrations/0001_enable.py b/pgroonga/migrations/0001_enable.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import models, migrations
+from django.contrib.postgres import operations
+from django.conf import settings
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('zerver', '0001_initial'),
+    ]
+
+    database_setting = settings.DATABASES["default"]
+    if "postgres" in database_setting["ENGINE"]:
+        operations = [
+            migrations.RunSQL("""
+ALTER ROLE %(USER)s SET search_path TO %(SCHEMA)s,public,pgroonga,pg_catalog;
+
+SET search_path = %(SCHEMA)s,public,pgroonga,pg_catalog;
+
+ALTER TABLE zerver_message ADD COLUMN search_pgroonga text;
+
+UPDATE zerver_message SET search_pgroonga = subject || ' ' || rendered_content;
+
+-- TODO: We want to use CREATE INDEX CONCURRENTLY but it can't be used in
+-- transaction. Django uses transaction implicitly.
+-- Django 1.10 may solve the problem.
+CREATE INDEX zerver_message_search_pgroonga ON zerver_message
+  USING pgroonga(search_pgroonga pgroonga.text_full_text_search_ops);
+""" % database_setting,
+                              """
+SET search_path = %(SCHEMA)s,public,pgroonga,pg_catalog;
+
+DROP INDEX zerver_message_search_pgroonga;
+ALTER TABLE zerver_message DROP COLUMN search_pgroonga;
+
+SET search_path = %(SCHEMA)s,public;
+
+ALTER ROLE %(USER)s SET search_path TO %(SCHEMA)s,public;
+""" % database_setting),
+        ]
+    else:
+        operations = []
diff --git a/pgroonga/migrations/__init__.py b/pgroonga/migrations/__init__.py
diff --git a/puppet/zulip/files/postgresql/process_fts_updates b/puppet/zulip/files/postgresql/process_fts_updates
@@ -34,6 +34,11 @@ def update_fts_columns(cursor):
     cursor.execute("SELECT id, message_id FROM fts_update_log;")
     ids = []
     for (id, message_id) in cursor.fetchall():
+        if settings.USING_PGROONGA:
+            cursor.execute("UPDATE zerver_message SET "
+                           "search_pgroonga = "
+                           "subject || ' ' || rendered_content "
+                           "WHERE id = %s", (message_id,))
         cursor.execute("UPDATE zerver_message SET "
                        "search_tsvector = to_tsvector('zulip.english_us_search', "
                        "subject || rendered_content) "

diff --git a/puppet/zulip/manifests/postgres_appdb_base.pp b/puppet/zulip/manifests/postgres_appdb_base.pp
@@ -67,7 +67,7 @@
   }
 
   $pgroonga = zulipconf("machine", "pgroonga", "")
-  if $pgroonga != "" {
+  if $pgroonga == "enabled" {
     apt::ppa {'ppa:groonga/ppa':
       before => Package["postgresql-${zulip::base::postgres_version}-pgroonga"],
     }

diff --git a/tools/setup/postgres-init-dev-db b/tools/setup/postgres-init-dev-db
@@ -74,6 +74,8 @@ EOF
 
 $ROOT_POSTGRES "$DBNAME_BASE" << EOF
 CREATE EXTENSION tsearch_extras SCHEMA zulip;
+CREATE EXTENSION pgroonga;
+GRANT USAGE ON SCHEMA pgroonga TO $USERNAME;
 EOF
 
 psql -h localhost postgres "$USERNAME" <<EOF

diff --git a/zerver/tests/test_narrow.py b/zerver/tests/test_narrow.py
@@ -1,6 +1,9 @@
+# -*- coding: utf-8 -*-
+
 from __future__ import absolute_import
 from __future__ import print_function
 from django.db import connection
+from django.test import override_settings
 from sqlalchemy.sql import (
     and_, select, column, compiler
 )
@@ -163,14 +166,26 @@ def test_add_term_using_id_operator_and_negated(self):  # NEGATED
         term = dict(operator='id', operand=555, negated=True)
         self._do_add_term_test(term, 'WHERE id != :param_1')
 
+    @override_settings(USING_PGROONGA=False)
     def test_add_term_using_search_operator(self):
         term = dict(operator='search', operand='"french fries"')
         self._do_add_term_test(term, 'WHERE (lower(content) LIKE lower(:content_1) OR lower(subject) LIKE lower(:subject_1)) AND (search_tsvector @@ plainto_tsquery(:param_2, :param_3))')
 
+    @override_settings(USING_PGROONGA=False)
     def test_add_term_using_search_operator_and_negated(self):  # NEGATED
         term = dict(operator='search', operand='"french fries"', negated=True)
         self._do_add_term_test(term, 'WHERE NOT (lower(content) LIKE lower(:content_1) OR lower(subject) LIKE lower(:subject_1)) AND NOT (search_tsvector @@ plainto_tsquery(:param_2, :param_3))')
 
+    @override_settings(USING_PGROONGA=True)
+    def test_add_term_using_search_operator_pgroonga(self):
+        term = dict(operator='search', operand='"french fries"')
+        self._do_add_term_test(term, 'WHERE search_pgroonga @@ :search_pgroonga_1')
+
+    @override_settings(USING_PGROONGA=True)
+    def test_add_term_using_search_operator_and_negated_pgroonga(self):  # NEGATED
+        term = dict(operator='search', operand='"french fries"', negated=True)
+        self._do_add_term_test(term, 'WHERE NOT (search_pgroonga @@ :search_pgroonga_1)')
+
     def test_add_term_using_has_operator_and_attachment_operand(self):
         term = dict(operator='has', operand='attachment')
         self._do_add_term_test(term, 'WHERE has_attachment')
@@ -487,6 +502,7 @@ def test_get_old_messages_with_narrow_sender(self):
         for message in result["messages"]:
             self.assertEqual(message["sender_email"], "[email protected]")
 
+    @override_settings(USING_PGROONGA=False)
     def test_get_old_messages_with_search(self):
         self.login("[email protected]")
 
@@ -513,10 +529,10 @@ def test_get_old_messages_with_search(self):
         # the search index up to date.
         with connection.cursor() as cursor:
             cursor.execute("""
-                UPDATE zerver_message SET
-                search_tsvector = to_tsvector('zulip.english_us_search',
-                subject || rendered_content)
-                """)
+            UPDATE zerver_message SET
+            search_tsvector = to_tsvector('zulip.english_us_search',
+            subject || rendered_content)
+            """)
 
         narrow = [
             dict(operator='sender', operand='[email protected]'),
@@ -547,6 +563,64 @@ def test_get_old_messages_with_search(self):
             meeting_message['match_content'],
             '<p>I am hungry!</p>')
 
+    @override_settings(USING_PGROONGA=True)
+    def test_get_old_messages_with_search_pgroonga(self):
+        self.login("[email protected]")
+
+        messages_to_search = [
+            (u'日本語', u'こんにちは。今日はいい天気ですね。'),
+            (u'日本語', u'今朝はごはんを食べました。'),
+            (u'日本語', u'昨日、日本のお菓子を送りました。'),
+            ('english', u'I want to go to 日本!'),
+            ('english', 'Can you speak Japanese?'),
+        ]
+
+        for topic, content in messages_to_search:
+            self.send_message(
+                sender_name="[email protected]",
+                raw_recipients="Verona",
+                message_type=Recipient.STREAM,
+                content=content,
+                subject=topic,
+            )
+
+        # We use brute force here and update our text search index
+        # for the entire zerver_message table (which is small in test
+        # mode).  In production there is an async process which keeps
+        # the search index up to date.
+        with connection.cursor() as cursor:
+            cursor.execute("""
+                UPDATE zerver_message SET
+                search_pgroonga = subject || ' ' || rendered_content
+                """)
+
+        narrow = [
+            dict(operator='search', operand=u'日本'),
+        ]
+        result = self.get_and_check_messages(dict(
+            narrow=ujson.dumps(narrow),
+            anchor=0,
+            num_after=10,
+        ))
+        self.assertEqual(len(result['messages']), 4)
+        messages = result['messages']
+
+        japanese_message = [m for m in messages if m['subject'] == u'日本語'][-1]
+        self.assertEqual(
+            japanese_message['match_subject'],
+            u'<span class="highlight">日本</span>語')
+        self.assertEqual(
+            japanese_message['match_content'],
+            u'<p>昨日、<span class="highlight">日本</span>の' +
+            u'お菓子を送りました。</p>')
+
+        english_message = [m for m in messages if m['subject'] == 'english'][0]
+        self.assertEqual(
+            english_message['match_subject'],
+            'english')
+        self.assertEqual(
+            english_message['match_content'],
+            u'<p>I want to go to <span class="highlight">日本</span>!</p>')
 
     def test_get_old_messages_with_only_searching_anchor(self):
         """
@@ -949,6 +1023,7 @@ def test_get_old_messages_with_narrow_queries(self):
                                                   'narrow': '[["stream", "Scotland"], ["is", "starred"]]'},
                                                  sql)
 
+    @override_settings(USING_PGROONGA=False)
     def test_get_old_messages_with_search_queries(self):
         query_ids = self.get_query_ids()
 
@@ -969,3 +1044,25 @@ def test_get_old_messages_with_search_queries(self):
         self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10,
                                                   'narrow': '[["search", "\\"jumping\\" quickly"]]'},
                                                  sql)
+
+    @override_settings(USING_PGROONGA=True)
+    def test_get_old_messages_with_search_queries_pgroonga(self):
+        query_ids = self.get_query_ids()
+
+        sql_template = u"SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.subject_matches \nFROM (SELECT message_id, flags, subject, rendered_content, pgroonga.match_positions_byte(rendered_content, pgroonga.query_extract_keywords('jumping')) AS content_matches, pgroonga.match_positions_byte(subject, pgroonga.query_extract_keywords('jumping')) AS subject_matches \nFROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \nWHERE user_profile_id = 2 AND (search_pgroonga @@ 'jumping') AND message_id >= 0 ORDER BY message_id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC"
+        sql = sql_template.format(**query_ids)
+        self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10,
+                                                  'narrow': '[["search", "jumping"]]'},
+                                                 sql)
+
+        sql_template = "SELECT anon_1.message_id, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.subject_matches \nFROM (SELECT id AS message_id, subject, rendered_content, pgroonga.match_positions_byte(rendered_content, pgroonga.query_extract_keywords('jumping')) AS content_matches, pgroonga.match_positions_byte(subject, pgroonga.query_extract_keywords('jumping')) AS subject_matches \nFROM zerver_message \nWHERE recipient_id = 9 AND (search_pgroonga @@ 'jumping') AND zerver_message.id >= 0 ORDER BY zerver_message.id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC"
+        sql = sql_template.format(**query_ids)
+        self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10,
+                                                  'narrow': '[["stream", "Scotland"], ["search", "jumping"]]'},
+                                                 sql)
+
+        sql_template = 'SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.subject_matches \nFROM (SELECT message_id, flags, subject, rendered_content, pgroonga.match_positions_byte(rendered_content, pgroonga.query_extract_keywords(\'"jumping" quickly\')) AS content_matches, pgroonga.match_positions_byte(subject, pgroonga.query_extract_keywords(\'"jumping" quickly\')) AS subject_matches \nFROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \nWHERE user_profile_id = 2 AND (search_pgroonga @@ \'"jumping" quickly\') AND message_id >= 0 ORDER BY message_id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC'
+        sql = sql_template.format(**query_ids)
+        self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10,
+                                                  'narrow': '[["search", "\\"jumping\\" quickly"]]'},
+                                                 sql)
diff --git a/zerver/views/messages.py b/zerver/views/messages.py
@@ -10,7 +10,7 @@
 from django.http import HttpRequest, HttpResponse
 from six import text_type
 from typing import Any, AnyStr, Iterable, Optional, Tuple
-from zerver.lib.str_utils import force_text
+from zerver.lib.str_utils import force_bytes, force_text
 
 from zerver.decorator import authenticated_api_view, authenticated_json_post_view, \
     has_request_variables, REQ, JsonableError, \
@@ -236,6 +236,23 @@ def by_pm_with(self, query, operand, maybe_negate):
             return query.where(maybe_negate(cond))
 
     def by_search(self, query, operand, maybe_negate):
+        if settings.USING_PGROONGA:
+            return self._by_search_pgroonga(query, operand, maybe_negate)
+        else:
+            return self._by_search_tsearch(query, operand, maybe_negate)
+
+    def _by_search_pgroonga(self, query, operand, maybe_negate):
+        match_positions_byte = func.pgroonga.match_positions_byte
+        query_extract_keywords = func.pgroonga.query_extract_keywords
+        keywords = query_extract_keywords(operand)
+        query = query.column(match_positions_byte(column("rendered_content"),
+                                                  keywords).label("content_matches"))
+        query = query.column(match_positions_byte(column("subject"),
+                                                  keywords).label("subject_matches"))
+        condition = column("search_pgroonga").op("@@")(operand)
+        return query.where(maybe_negate(condition))
+
+    def _by_search_tsearch(self, query, operand, maybe_negate):
         tsquery = func.plainto_tsquery(literal("zulip.english_us_search"), literal(operand))
         ts_locs_array = func.ts_match_locs_array
         query = query.column(ts_locs_array(literal("zulip.english_us_search"),
@@ -264,7 +281,7 @@ def by_search(self, query, operand, maybe_negate):
 # Apparently, the offsets we get from tsearch_extras are counted in
 # unicode characters, not in bytes, so we do our processing with text,
 # not bytes.
-def highlight_string(text, locs):
+def highlight_string_text_offsets(text, locs):
     # type: (AnyStr, Iterable[Tuple[int, int]]) -> text_type
     string = force_text(text)
     highlight_start = u'<span class="highlight">'
@@ -281,6 +298,30 @@ def highlight_string(text, locs):
     result += string[pos:]
     return result
 
+def highlight_string_bytes_offsets(text, locs):
+    # type: (AnyStr, Iterable[Tuple[int, int]]) -> text_type
+    string = force_bytes(text)
+    highlight_start = b'<span class="highlight">'
+    highlight_stop = b'</span>'
+    pos = 0
+    result = b''
+    for loc in locs:
+        (offset, length) = loc
+        result += string[pos:offset]
+        result += highlight_start
+        result += string[offset:offset + length]
+        result += highlight_stop
+        pos = offset + length
+    result += string[pos:]
+    return force_text(result)
+
+def highlight_string(text, locs):
+    # type: (AnyStr, Iterable[Tuple[int, int]]) -> text_type
+    if settings.USING_PGROONGA:
+        return highlight_string_bytes_offsets(text, locs)
+    else:
+        return highlight_string_text_offsets(text, locs)
+
 def get_search_fields(rendered_content, subject, content_matches, subject_matches):
     # type: (text_type, text_type, Iterable[Tuple[int, int]], Iterable[Tuple[int, int]]) -> Dict[str, text_type]
     return dict(match_content=highlight_string(rendered_content, content_matches),

diff --git a/zproject/dev_settings.py b/zproject/dev_settings.py
@@ -25,3 +25,4 @@
 SAVE_FRONTEND_STACKTRACES = True
 EVENT_LOGS_ENABLED = True
 SYSTEM_ONLY_REALMS = set() # type: Set[str]
+USING_PGROONGA = True