forked from zulip/zulip
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support full text search for all languages using pgroonga.
This adds support for using PGroonga to back the Zulip full-text search feature, because the built-in PostgreSQL full-text search doesn't support languages that don't put spaces between terms, such as Japanese and Chinese. PGroonga supports all languages, including Japanese and Chinese. Developers will need to re-provision when rebasing past this patch for the tests to pass, since provisioning is what installs the PGroonga package and extension. PGroonga is enabled by default in development but not in production; the hope is that after the PGroonga support has been tested further, we can enable it by default. Fixes zulip#615. [docs and tests tweaked by tabbott]
- Loading branch information
Showing
11 changed files
with
285 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# -*- coding: utf-8 -*- | ||
from __future__ import unicode_literals | ||
|
||
from django.db import models, migrations | ||
from django.contrib.postgres import operations | ||
from django.conf import settings | ||
|
||
|
||
# Schema migration that adds a PGroonga-backed full-text search column | ||
# (search_pgroonga) and index to zerver_message. The SQL is only emitted when | ||
# the default database ENGINE contains "postgres"; otherwise the migration | ||
# has no operations. | ||
class Migration(migrations.Migration): | ||
|
||
# Ordered after the initial zerver migration. | ||
dependencies = [ | ||
('zerver', '0001_initial'), | ||
] | ||
|
||
# The SQL strings below are %-formatted with this dict, so besides ENGINE | ||
# it must also provide the USER and SCHEMA keys. | ||
database_setting = settings.DATABASES["default"] | ||
if "postgres" in database_setting["ENGINE"]: | ||
# RunSQL receives two SQL strings. The first (forward) adds pgroonga to | ||
# the role's and the session's search_path, adds the search_pgroonga | ||
# column, fills it with subject || ' ' || rendered_content, and creates | ||
# the pgroonga index. The second (reverse) drops the index and column | ||
# and resets search_path to SCHEMA,public. | ||
operations = [ | ||
migrations.RunSQL(""" | ||
ALTER ROLE %(USER)s SET search_path TO %(SCHEMA)s,public,pgroonga,pg_catalog; | ||
SET search_path = %(SCHEMA)s,public,pgroonga,pg_catalog; | ||
ALTER TABLE zerver_message ADD COLUMN search_pgroonga text; | ||
UPDATE zerver_message SET search_pgroonga = subject || ' ' || rendered_content; | ||
-- TODO: We want to use CREATE INDEX CONCURRENTLY but it can't be used in | ||
-- transaction. Django uses transaction implicitly. | ||
-- Django 1.10 may solve the problem. | ||
CREATE INDEX zerver_message_search_pgroonga ON zerver_message | ||
USING pgroonga(search_pgroonga pgroonga.text_full_text_search_ops); | ||
""" % database_setting, | ||
""" | ||
SET search_path = %(SCHEMA)s,public,pgroonga,pg_catalog; | ||
DROP INDEX zerver_message_search_pgroonga; | ||
ALTER TABLE zerver_message DROP COLUMN search_pgroonga; | ||
SET search_path = %(SCHEMA)s,public; | ||
ALTER ROLE %(USER)s SET search_path TO %(SCHEMA)s,public; | ||
""" % database_setting), | ||
] | ||
# Non-PostgreSQL engines: nothing to do. | ||
else: | ||
operations = [] |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,9 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
from __future__ import absolute_import | ||
from __future__ import print_function | ||
from django.db import connection | ||
from django.test import override_settings | ||
from sqlalchemy.sql import ( | ||
and_, select, column, compiler | ||
) | ||
|
@@ -163,14 +166,26 @@ def test_add_term_using_id_operator_and_negated(self): # NEGATED | |
term = dict(operator='id', operand=555, negated=True) | ||
self._do_add_term_test(term, 'WHERE id != :param_1') | ||
|
||
# With USING_PGROONGA overridden to False, a 'search' term is expected to | ||
# produce the LIKE checks on content/subject plus the search_tsvector match | ||
# shown in the WHERE clause below. | ||
@override_settings(USING_PGROONGA=False) | ||
def test_add_term_using_search_operator(self): | ||
term = dict(operator='search', operand='"french fries"') | ||
self._do_add_term_test(term, 'WHERE (lower(content) LIKE lower(:content_1) OR lower(subject) LIKE lower(:subject_1)) AND (search_tsvector @@ plainto_tsquery(:param_2, :param_3))') | ||
|
||
# Negated variant with USING_PGROONGA overridden to False: both the LIKE | ||
# group and the search_tsvector match are expected to be wrapped in NOT. | ||
@override_settings(USING_PGROONGA=False) | ||
def test_add_term_using_search_operator_and_negated(self): # NEGATED | ||
term = dict(operator='search', operand='"french fries"', negated=True) | ||
self._do_add_term_test(term, 'WHERE NOT (lower(content) LIKE lower(:content_1) OR lower(subject) LIKE lower(:subject_1)) AND NOT (search_tsvector @@ plainto_tsquery(:param_2, :param_3))') | ||
|
||
# With USING_PGROONGA overridden to True, a 'search' term is expected to | ||
# become a single match against the search_pgroonga column. | ||
@override_settings(USING_PGROONGA=True) | ||
def test_add_term_using_search_operator_pgroonga(self): | ||
term = dict(operator='search', operand='"french fries"') | ||
self._do_add_term_test(term, 'WHERE search_pgroonga @@ :search_pgroonga_1') | ||
|
||
# Negated variant with USING_PGROONGA overridden to True: the search_pgroonga | ||
# match is expected to be wrapped in NOT (...). | ||
@override_settings(USING_PGROONGA=True) | ||
def test_add_term_using_search_operator_and_negated_pgroonga(self): # NEGATED | ||
term = dict(operator='search', operand='"french fries"', negated=True) | ||
self._do_add_term_test(term, 'WHERE NOT (search_pgroonga @@ :search_pgroonga_1)') | ||
|
||
def test_add_term_using_has_operator_and_attachment_operand(self): | ||
term = dict(operator='has', operand='attachment') | ||
self._do_add_term_test(term, 'WHERE has_attachment') | ||
|
@@ -487,6 +502,7 @@ def test_get_old_messages_with_narrow_sender(self): | |
for message in result["messages"]: | ||
self.assertEqual(message["sender_email"], "[email protected]") | ||
|
||
@override_settings(USING_PGROONGA=False) | ||
def test_get_old_messages_with_search(self): | ||
self.login("[email protected]") | ||
|
||
|
@@ -513,10 +529,10 @@ def test_get_old_messages_with_search(self): | |
# the search index up to date. | ||
with connection.cursor() as cursor: | ||
cursor.execute(""" | ||
UPDATE zerver_message SET | ||
search_tsvector = to_tsvector('zulip.english_us_search', | ||
subject || rendered_content) | ||
""") | ||
UPDATE zerver_message SET | ||
search_tsvector = to_tsvector('zulip.english_us_search', | ||
subject || rendered_content) | ||
""") | ||
|
||
narrow = [ | ||
dict(operator='sender', operand='[email protected]'), | ||
|
@@ -547,6 +563,64 @@ def test_get_old_messages_with_search(self): | |
meeting_message['match_content'], | ||
'<p>I am hungry!</p>') | ||
|
||
# End-to-end search test with USING_PGROONGA overridden to True: sends messages | ||
# with Japanese and English text, searches for a Japanese term, and checks the | ||
# result count and the highlighted match_subject / match_content fields. | ||
@override_settings(USING_PGROONGA=True) | ||
def test_get_old_messages_with_search_pgroonga(self): | ||
self.login("[email protected]") | ||
|
||
# (topic, content) pairs. Four of the five contain the search term in | ||
# either the topic or the content; the last one contains it in neither. | ||
messages_to_search = [ | ||
(u'日本語', u'こんにちは。今日はいい天気ですね。'), | ||
(u'日本語', u'今朝はごはんを食べました。'), | ||
(u'日本語', u'昨日、日本のお菓子を送りました。'), | ||
('english', u'I want to go to 日本!'), | ||
('english', 'Can you speak Japanese?'), | ||
] | ||
|
||
for topic, content in messages_to_search: | ||
self.send_message( | ||
sender_name="[email protected]", | ||
raw_recipients="Verona", | ||
message_type=Recipient.STREAM, | ||
content=content, | ||
subject=topic, | ||
) | ||
|
||
# We use brute force here and update our text search index | ||
# for the entire zerver_message table (which is small in test | ||
# mode). In production there is an async process which keeps | ||
# the search index up to date. | ||
with connection.cursor() as cursor: | ||
cursor.execute(""" | ||
UPDATE zerver_message SET | ||
search_pgroonga = subject || ' ' || rendered_content | ||
""") | ||
|
||
narrow = [ | ||
dict(operator='search', operand=u'日本'), | ||
] | ||
result = self.get_and_check_messages(dict( | ||
narrow=ujson.dumps(narrow), | ||
anchor=0, | ||
num_after=10, | ||
)) | ||
# The indexed text is subject + ' ' + rendered_content, so the three | ||
# messages under the Japanese topic plus one English-topic message are | ||
# the expected hits. | ||
self.assertEqual(len(result['messages']), 4) | ||
messages = result['messages'] | ||
|
||
# [-1] takes the last returned message on the Japanese topic; the | ||
# assertions below expect it to be the third one sent. | ||
# NOTE(review): this presumably relies on results being in send order - confirm. | ||
japanese_message = [m for m in messages if m['subject'] == u'日本語'][-1] | ||
self.assertEqual( | ||
japanese_message['match_subject'], | ||
u'<span class="highlight">日本</span>語') | ||
self.assertEqual( | ||
japanese_message['match_content'], | ||
u'<p>昨日、<span class="highlight">日本</span>の' + | ||
u'お菓子を送りました。</p>') | ||
|
||
# For the English-topic hit the subject has no match, so it is expected | ||
# back without highlight markup; only the content is highlighted. | ||
english_message = [m for m in messages if m['subject'] == 'english'][0] | ||
self.assertEqual( | ||
english_message['match_subject'], | ||
'english') | ||
self.assertEqual( | ||
english_message['match_content'], | ||
u'<p>I want to go to <span class="highlight">日本</span>!</p>') | ||
|
||
def test_get_old_messages_with_only_searching_anchor(self): | ||
""" | ||
|
@@ -949,6 +1023,7 @@ def test_get_old_messages_with_narrow_queries(self): | |
'narrow': '[["stream", "Scotland"], ["is", "starred"]]'}, | ||
sql) | ||
|
||
@override_settings(USING_PGROONGA=False) | ||
def test_get_old_messages_with_search_queries(self): | ||
query_ids = self.get_query_ids() | ||
|
||
|
@@ -969,3 +1044,25 @@ def test_get_old_messages_with_search_queries(self): | |
self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10, | ||
'narrow': '[["search", "\\"jumping\\" quickly"]]'}, | ||
sql) | ||
|
||
# Checks the SQL expected for three search narrows with USING_PGROONGA | ||
# overridden to True, passing each narrow and its expected SQL string to | ||
# common_check_get_old_messages_query. | ||
# NOTE(review): the templates below contain no {placeholder} fields, so | ||
# .format(**query_ids) leaves them unchanged and the IDs (user_profile_id = 2, | ||
# recipient_id = 9) are hard-coded - confirm this is intended. | ||
@override_settings(USING_PGROONGA=True) | ||
def test_get_old_messages_with_search_queries_pgroonga(self): | ||
query_ids = self.get_query_ids() | ||
|
||
# Case 1: search-only narrow; the expected query joins zerver_usermessage. | ||
sql_template = u"SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.subject_matches \nFROM (SELECT message_id, flags, subject, rendered_content, pgroonga.match_positions_byte(rendered_content, pgroonga.query_extract_keywords('jumping')) AS content_matches, pgroonga.match_positions_byte(subject, pgroonga.query_extract_keywords('jumping')) AS subject_matches \nFROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \nWHERE user_profile_id = 2 AND (search_pgroonga @@ 'jumping') AND message_id >= 0 ORDER BY message_id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC" | ||
sql = sql_template.format(**query_ids) | ||
self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10, | ||
'narrow': '[["search", "jumping"]]'}, | ||
sql) | ||
|
||
# Case 2: stream narrow plus search; the expected query selects from | ||
# zerver_message only and has no flags column. | ||
sql_template = "SELECT anon_1.message_id, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.subject_matches \nFROM (SELECT id AS message_id, subject, rendered_content, pgroonga.match_positions_byte(rendered_content, pgroonga.query_extract_keywords('jumping')) AS content_matches, pgroonga.match_positions_byte(subject, pgroonga.query_extract_keywords('jumping')) AS subject_matches \nFROM zerver_message \nWHERE recipient_id = 9 AND (search_pgroonga @@ 'jumping') AND zerver_message.id >= 0 ORDER BY zerver_message.id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC" | ||
sql = sql_template.format(**query_ids) | ||
self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10, | ||
'narrow': '[["stream", "Scotland"], ["search", "jumping"]]'}, | ||
sql) | ||
|
||
# Case 3: operand containing a double-quoted phrase; the whole operand | ||
# is expected to appear verbatim in the pgroonga calls. | ||
sql_template = 'SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.subject_matches \nFROM (SELECT message_id, flags, subject, rendered_content, pgroonga.match_positions_byte(rendered_content, pgroonga.query_extract_keywords(\'"jumping" quickly\')) AS content_matches, pgroonga.match_positions_byte(subject, pgroonga.query_extract_keywords(\'"jumping" quickly\')) AS subject_matches \nFROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \nWHERE user_profile_id = 2 AND (search_pgroonga @@ \'"jumping" quickly\') AND message_id >= 0 ORDER BY message_id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC' | ||
sql = sql_template.format(**query_ids) | ||
self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10, | ||
'narrow': '[["search", "\\"jumping\\" quickly"]]'}, | ||
sql) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.