support for Elastic(Open)search CJK analysis plugins (#34784)

* support for Elastic(Open)search CJK analysis plugins

* addresses PR review comments

* addresses PR comments

* moves CJK based tests to own file and adds some more

The Dockerfiles for OpenSearch and Elasticsearch are changed to install the
plugins required for testing or running locally.

* properly sort error messages

* fix style issues

* removes trailing space
This commit is contained in:
Carlos Garcia 2026-02-12 10:05:23 +01:00 committed by GitHub
parent 671e2b7640
commit feca30d85f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
18 changed files with 592 additions and 87 deletions

View file

@ -484,6 +484,7 @@
"Password": "changeme",
"EnableIndexing": false,
"EnableSearching": false,
"EnableCJKAnalyzers": false,
"EnableAutocomplete": false,
"Sniff": false,
"PostIndexReplicas": 1,

View file

@ -627,6 +627,7 @@ const defaultServerConfig: AdminConfig = {
Password: 'changeme',
EnableIndexing: false,
EnableSearching: false,
EnableCJKAnalyzers: false,
EnableAutocomplete: false,
Sniff: true,
PostIndexReplicas: 1,

View file

@ -112,6 +112,7 @@ linters:
channels/store/localcachelayer/webhook_layer_test.go|\
channels/store/retrylayer/retrylayer_test.go|\
channels/store/searchtest/channel_layer.go|\
channels/store/searchtest/cjk_plugins.go|\
channels/store/searchtest/file_info_layer.go|\
channels/store/searchtest/helper.go|\
channels/store/searchtest/post_layer.go|\

View file

@ -0,0 +1,4 @@
# Builds on the Mattermost development Elasticsearch image (version selectable
# through the ELASTICSEARCH_VERSION build argument) and installs the
# analysis-nori, analysis-kuromoji and analysis-smartcn plugins on top of it.
ARG ELASTICSEARCH_VERSION=8.9.0
FROM mattermostdevelopment/mattermost-elasticsearch:${ELASTICSEARCH_VERSION}
RUN /usr/share/elasticsearch/bin/elasticsearch-plugin install --batch analysis-nori analysis-kuromoji analysis-smartcn

View file

@ -1,4 +1,4 @@
ARG OPENSEARCH_VERSION=2.7.0
FROM opensearchproject/opensearch:$OPENSEARCH_VERSION
RUN /usr/share/opensearch/bin/opensearch-plugin install analysis-icu
RUN /usr/share/opensearch/bin/opensearch-plugin install analysis-icu analysis-nori analysis-kuromoji analysis-smartcn

View file

@ -47,7 +47,9 @@ services:
LDAP_DOMAIN: "mm.test.com"
LDAP_ADMIN_PASSWORD: "mostest"
elasticsearch:
image: "mattermostdevelopment/mattermost-elasticsearch:8.9.0"
build:
context: .
dockerfile: ./Dockerfile.elasticsearch
networks:
- mm-test
environment:

View file

@ -0,0 +1,253 @@
// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.txt for license information.
package searchtest
import (
"testing"
"github.com/mattermost/mattermost/server/public/model"
"github.com/mattermost/mattermost/server/public/shared/request"
"github.com/mattermost/mattermost/server/v8/channels/store"
"github.com/stretchr/testify/require"
)
// TestSearchPostStoreEnabledCJK exercises post search with Korean, Japanese and
// Chinese content against the given store.
//
// It builds the basic fixtures through SearchTestHelper once, then every
// sub-test creates its own posts in th.ChannelBasic, queries them with
// SearchPostsForUser and removes all posts of th.User when it finishes.
// NOTE(review): the sub-test names indicate the search backend is expected to
// have the nori, kuromoji and smartcn analyzers available — confirm the test
// environment provides them before relying on the expected result counts.
func TestSearchPostStoreEnabledCJK(t *testing.T, s store.Store) {
th := &SearchTestHelper{
Context: request.TestContext(t),
Store: s,
}
// Shared fixtures for all sub-tests; cleaned up when the whole test returns.
err := th.SetupBasicFixtures()
require.NoError(t, err)
defer th.CleanFixtures()
// Korean: exact and wildcard terms, segmentation, mixed-language content and quoted phrases.
t.Run("Korean searches using nori analyzer", func(t *testing.T) {
t.Run("should be able to search with wildcard and exact search", func(t *testing.T) {
p1, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "한글", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
p2, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "한국", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
defer th.deleteUserPosts(th.User.Id)
// Exact search
params := &model.SearchParams{Terms: "한글"}
results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 1)
th.checkPostInSearchResults(t, p1.Id, results.Posts)
// Wildcard search
params = &model.SearchParams{Terms: "한*"}
results, err = th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 2)
th.checkPostInSearchResults(t, p1.Id, results.Posts)
th.checkPostInSearchResults(t, p2.Id, results.Posts)
})
t.Run("should search one word and phrase with Nori segmentation", func(t *testing.T) {
pBul, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "불", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
pBulda, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "불다", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
p3, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "소고기덮밥", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
p4, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "치킨덮밥", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
defer th.deleteUserPosts(th.User.Id)
// One word "불": Nori segments "불다" into "불"+"다", so "불" matches both posts
params := &model.SearchParams{Terms: "불"}
results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 2)
th.checkPostInSearchResults(t, pBul.Id, results.Posts)
th.checkPostInSearchResults(t, pBulda.Id, results.Posts)
// Unquoted "불다": SimpleQueryString treats one term as OR of analyzed tokens (불 OR 다), so both posts match
params = &model.SearchParams{Terms: "불다"}
results, err = th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 2)
th.checkPostInSearchResults(t, pBul.Id, results.Posts)
th.checkPostInSearchResults(t, pBulda.Id, results.Posts)
// Unquoted 덮밥 should match 소고기덮밥 and 치킨덮밥
params = &model.SearchParams{Terms: "덮밥"}
results, err = th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 2)
th.checkPostInSearchResults(t, p3.Id, results.Posts)
th.checkPostInSearchResults(t, p4.Id, results.Posts)
})
t.Run("should search in mixed Korean and English content", func(t *testing.T) {
p5, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "오늘 회의실 예약 meeting", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
defer th.deleteUserPosts(th.User.Id)
params := &model.SearchParams{Terms: "회의실"}
results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 1)
th.checkPostInSearchResults(t, p5.Id, results.Posts)
})
t.Run("should search using phrase search", func(t *testing.T) {
p6, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "오늘 회의실 예약", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
defer th.deleteUserPosts(th.User.Id)
params := &model.SearchParams{Terms: "\"오늘 회의실\""}
results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 1)
th.checkPostInSearchResults(t, p6.Id, results.Posts)
})
})
// Japanese: exact and wildcard terms, mixed-language content, quoted phrases and verb conjugations.
t.Run("Japanese searches using kuromoji analyzer", func(t *testing.T) {
t.Run("should be able to search using wildcard and exact search", func(t *testing.T) {
p1, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "東京", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
p2, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "東北", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
defer th.deleteUserPosts(th.User.Id)
// Exact match
params := &model.SearchParams{Terms: "東京"}
results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 1)
th.checkPostInSearchResults(t, p1.Id, results.Posts)
// Wildcard search
params = &model.SearchParams{Terms: "東*"}
results, err = th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 2)
th.checkPostInSearchResults(t, p1.Id, results.Posts)
th.checkPostInSearchResults(t, p2.Id, results.Posts)
})
t.Run("should search in mixed Japanese and English content", func(t *testing.T) {
p4, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "projectの締め切りは来週", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
defer th.deleteUserPosts(th.User.Id)
params := &model.SearchParams{Terms: "締め切り"}
results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 1)
th.checkPostInSearchResults(t, p4.Id, results.Posts)
})
t.Run("should search using phrase search", func(t *testing.T) {
p3, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "今日の会議は中止です", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
defer th.deleteUserPosts(th.User.Id)
params := &model.SearchParams{Terms: "\"今日の会議\""}
results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 1)
th.checkPostInSearchResults(t, p3.Id, results.Posts)
})
t.Run("should find conjugated verb forms when searching infinitive", func(t *testing.T) {
// Create posts with different verb conjugations
pInfinitive, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "食べる", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
pPast, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "昨日ラーメンを食べました", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
pTe, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "食べている", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
pNegative, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "食べない", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
defer th.deleteUserPosts(th.User.Id)
// Search for infinitive form should find all conjugated forms
params := &model.SearchParams{Terms: "食べる"}
results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 4)
th.checkPostInSearchResults(t, pInfinitive.Id, results.Posts)
th.checkPostInSearchResults(t, pPast.Id, results.Posts)
th.checkPostInSearchResults(t, pTe.Id, results.Posts)
th.checkPostInSearchResults(t, pNegative.Id, results.Posts)
})
})
// Chinese: exact and wildcard terms, one/two character segmentation, mixed-language content and quoted phrases.
t.Run("Chinese searches using smartcn analyzer", func(t *testing.T) {
t.Run("should be able to search using wildcard and exact search", func(t *testing.T) {
p1, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "电脑", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
p2, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "电话", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
defer th.deleteUserPosts(th.User.Id)
// Exact search
params := &model.SearchParams{Terms: "电脑"}
results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 1)
th.checkPostInSearchResults(t, p1.Id, results.Posts)
// Wildcard search
params = &model.SearchParams{Terms: "电*"}
results, err = th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 2)
th.checkPostInSearchResults(t, p1.Id, results.Posts)
th.checkPostInSearchResults(t, p2.Id, results.Posts)
})
t.Run("should search one and two characters with SmartCN segmentation (你 / 你好)", func(t *testing.T) {
pNiHao, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "你好", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
pNi, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "你", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
defer th.deleteUserPosts(th.User.Id)
// One character: SmartCN segments "你好" into "你"+"好", so "你" matches both
params := &model.SearchParams{Terms: "你"}
results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 2)
th.checkPostInSearchResults(t, pNi.Id, results.Posts)
th.checkPostInSearchResults(t, pNiHao.Id, results.Posts)
// Two characters (unquoted): SimpleQueryString matches any analyzed token (你 OR 好), so both posts match
params = &model.SearchParams{Terms: "你好"}
results, err = th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 2)
th.checkPostInSearchResults(t, pNi.Id, results.Posts)
th.checkPostInSearchResults(t, pNiHao.Id, results.Posts)
})
t.Run("should search in mixed Chinese and English content", func(t *testing.T) {
p3, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "this is 今天开会讨论API接口 content", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
defer th.deleteUserPosts(th.User.Id)
params := &model.SearchParams{Terms: "接口"}
results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 1)
th.checkPostInSearchResults(t, p3.Id, results.Posts)
})
t.Run("should search using phrase search", func(t *testing.T) {
p4, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "this is 今天开会讨论API接口 content", "", model.PostTypeDefault, 0, false)
require.NoError(t, err)
defer th.deleteUserPosts(th.User.Id)
params := &model.SearchParams{Terms: "\"今天开会\""}
results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20)
require.NoError(t, err)
require.Len(t, results.Posts, 1)
th.checkPostInSearchResults(t, p4.Id, results.Posts)
})
})
}

View file

@ -1,6 +1,5 @@
services:
elasticsearch:
image: "mattermostdevelopment/mattermost-elasticsearch:8.9.0"
platform: linux/arm64/v8
restart: 'no'
container_name: mattermost-elasticsearch

View file

@ -12,6 +12,7 @@ import (
"runtime"
"strings"
"time"
"unicode"
"github.com/mattermost/mattermost/server/public/model"
"github.com/mattermost/mattermost/server/v8/platform/services/searchengine"
@ -395,3 +396,16 @@ func GetMatchesForHit(highlights map[string][]string) ([]string, error) {
return matches, nil
}
// ContainsCJK reports whether s holds at least one rune belonging to the Han
// (Chinese characters, also used in Japanese), Hangul (Korean), Hiragana or
// Katakana (Japanese) scripts. An empty string yields false.
func ContainsCJK(s string) bool {
	for _, ch := range s {
		// unicode.In is true as soon as the rune is a member of any listed table.
		if unicode.In(ch, unicode.Han, unicode.Hangul, unicode.Hiragana, unicode.Katakana) {
			return true
		}
	}
	return false
}

View file

@ -9,91 +9,191 @@ import (
"github.com/elastic/go-elasticsearch/v8/typedapi/indices/putindextemplate"
"github.com/elastic/go-elasticsearch/v8/typedapi/types"
"github.com/mattermost/mattermost/server/public/model"
"github.com/mattermost/mattermost/server/public/shared/mlog"
)
func GetPostTemplate(cfg *model.Config) *putindextemplate.Request {
mappings := &types.TypeMapping{
Properties: map[string]types.Property{
"message": types.TextProperty{
Analyzer: model.NewPointer("mm_lowercaser"),
Type: "text",
// addQueryProperty adds a sub-field named after analyzer to the text property
// propertyName of mappings. The sub-field is itself a text property analyzed
// with "mm_<analyzer>". If propertyName is missing or is not a
// types.TextProperty value, the mappings are left untouched.
func addQueryProperty(mappings *types.TypeMapping, propertyName string, analyzer string) {
	// Only text properties (as declared by the default mappings) can carry the
	// extra sub-field; bail out for anything else.
	prop, isText := mappings.Properties[propertyName].(types.TextProperty)
	if !isText {
		return
	}

	if prop.Fields == nil {
		prop.Fields = map[string]types.Property{}
	}
	prop.Fields[analyzer] = types.TextProperty{
		Type:     "text",
		Analyzer: model.NewPointer("mm_" + analyzer),
	}

	// prop is a copy of the value stored in the map, so store it back.
	mappings.Properties[propertyName] = prop
}
func WithNoriAnalyzer() func(template *types.IndexTemplateMapping) {
return func(template *types.IndexTemplateMapping) {
mlog.Info("using nori analyzer")
addQueryProperty(template.Mappings, "message", "nori")
addQueryProperty(template.Mappings, "attachments", "nori")
template.Settings.Analysis.Tokenizer["nori_tokenizer"] = map[string]any{
"type": "nori_tokenizer",
"decompound_mode": "mixed",
}
template.Settings.Analysis.Analyzer["mm_nori"] = map[string]any{
"char_filter": []string{
"leading_underscores",
"trailing_underscores",
},
"attachments": types.TextProperty{
Analyzer: model.NewPointer("mm_lowercaser"),
Type: "text",
"tokenizer": "nori_tokenizer",
"filter": []string{
"nori_readingform",
"nori_part_of_speech",
"lowercase",
},
"urls": types.TextProperty{
Analyzer: model.NewPointer("mm_url"),
Type: "text",
}
}
}
func WithKuromojiAnalyzer() func(template *types.IndexTemplateMapping) {
return func(template *types.IndexTemplateMapping) {
mlog.Info("using kuromoji analyzer")
addQueryProperty(template.Mappings, "message", "kuromoji")
addQueryProperty(template.Mappings, "attachments", "kuromoji")
template.Settings.Analysis.Tokenizer["kuromoji_tokenizer"] = map[string]any{
"type": "kuromoji_tokenizer",
"mode": "search",
}
template.Settings.Analysis.Analyzer["mm_kuromoji"] = map[string]any{
"char_filter": []string{
"leading_underscores",
"trailing_underscores",
},
"hashtags": types.KeywordProperty{
Type: "keyword",
Normalizer: model.NewPointer("mm_hashtag"),
Store: model.NewPointer(true),
"tokenizer": "kuromoji_tokenizer",
"filter": []string{
"kuromoji_baseform",
"kuromoji_part_of_speech",
"ja_stop",
"kuromoji_stemmer",
"lowercase",
},
}
}
}
// WithSmartCNAnalyzer returns a template option that registers the
// "mm_smartcn" analyzer in the template's analysis settings and adds a
// "smartcn" sub-field to the "message" and "attachments" properties.
func WithSmartCNAnalyzer() func(template *types.IndexTemplateMapping) {
	return func(template *types.IndexTemplateMapping) {
		mlog.Info("using smartcn analyzer")

		for _, field := range []string{"message", "attachments"} {
			addQueryProperty(template.Mappings, field, "smartcn")
		}

		// Same underscore char filters as the other custom analyzers, combined
		// with the smartcn tokenizer and stop filter.
		smartcn := map[string]any{
			"char_filter": []string{"leading_underscores", "trailing_underscores"},
			"tokenizer":   "smartcn_tokenizer",
			"filter":      []string{"smartcn_stop", "lowercase"},
		}
		template.Settings.Analysis.Analyzer["mm_smartcn"] = smartcn
	}
}
func GetPostTemplate(cfg *model.Config, opts ...func(*types.IndexTemplateMapping)) *putindextemplate.Request {
template := &types.IndexTemplateMapping{
Settings: &types.IndexSettings{
Index: &types.IndexSettings{
NumberOfShards: model.NewPointer(strconv.Itoa(*cfg.ElasticsearchSettings.PostIndexShards)),
NumberOfReplicas: model.NewPointer(strconv.Itoa(*cfg.ElasticsearchSettings.PostIndexReplicas)),
},
Analysis: &types.IndexSettingsAnalysis{
Tokenizer: map[string]types.Tokenizer{},
CharFilter: map[string]types.CharFilter{
"leading_underscores": map[string]any{
"type": "pattern_replace",
"pattern": `(^|[\s\r\n])_`,
"replacement": "$1",
},
"trailing_underscores": map[string]any{
"type": "pattern_replace",
"pattern": `_([\s\r\n]|$)`,
"replacement": "$1",
},
},
Analyzer: map[string]types.Analyzer{
"mm_lowercaser": map[string]any{
"tokenizer": "icu_tokenizer",
"filter": []string{
"icu_normalizer",
"mm_snowball",
"mm_stop",
},
"char_filter": []string{
"leading_underscores",
"trailing_underscores",
},
},
"mm_url": map[string]any{
"tokenizer": "pattern",
"pattern": "\\W",
"lowercase": true,
},
},
Filter: map[string]types.TokenFilter{
"mm_snowball": map[string]any{
"type": "snowball",
"language": "English",
},
"mm_stop": map[string]any{
"type": "stop",
"stopwords": "_english_",
},
},
Normalizer: map[string]types.Normalizer{
"mm_hashtag": map[string]any{
"type": "custom",
"char_filter": []string{},
"filter": []string{"lowercase", "icu_normalizer"},
},
},
},
},
Mappings: &types.TypeMapping{
Properties: map[string]types.Property{
"message": types.TextProperty{
Analyzer: model.NewPointer("mm_lowercaser"),
Type: "text",
},
"attachments": types.TextProperty{
Analyzer: model.NewPointer("mm_lowercaser"),
Type: "text",
},
"urls": types.TextProperty{
Analyzer: model.NewPointer("mm_url"),
Type: "text",
},
"hashtags": types.KeywordProperty{
Type: "keyword",
Normalizer: model.NewPointer("mm_hashtag"),
Store: model.NewPointer(true),
},
},
},
}
for _, opt := range opts {
opt(template)
}
return &putindextemplate.Request{
IndexPatterns: []string{*cfg.ElasticsearchSettings.IndexPrefix + IndexBasePosts + "*"},
Template: &types.IndexTemplateMapping{
Settings: &types.IndexSettings{
Index: &types.IndexSettings{
NumberOfShards: model.NewPointer(strconv.Itoa(*cfg.ElasticsearchSettings.PostIndexShards)),
NumberOfReplicas: model.NewPointer(strconv.Itoa(*cfg.ElasticsearchSettings.PostIndexReplicas)),
},
Analysis: &types.IndexSettingsAnalysis{
CharFilter: map[string]types.CharFilter{
"leading_underscores": map[string]any{
"type": "pattern_replace",
"pattern": `(^|[\s\r\n])_`,
"replacement": "$1",
},
"trailing_underscores": map[string]any{
"type": "pattern_replace",
"pattern": `_([\s\r\n]|$)`,
"replacement": "$1",
},
},
Analyzer: map[string]types.Analyzer{
"mm_lowercaser": map[string]any{
"tokenizer": "icu_tokenizer",
"filter": []string{
"icu_normalizer",
"mm_snowball",
"mm_stop",
},
"char_filter": []string{
"leading_underscores",
"trailing_underscores",
},
},
"mm_url": map[string]any{
"tokenizer": "pattern",
"pattern": "\\W",
"lowercase": true,
}},
Filter: map[string]types.TokenFilter{
"mm_snowball": map[string]any{
"type": "snowball",
"language": "English",
},
"mm_stop": map[string]any{
"type": "stop",
"stopwords": "_english_",
},
},
Normalizer: map[string]types.Normalizer{
"mm_hashtag": map[string]any{
"type": "custom",
"char_filter": []string{},
"filter": []string{"lowercase", "icu_normalizer"},
},
},
},
},
Mappings: mappings,
},
Template: template,
}
}

View file

@ -48,6 +48,10 @@ func (c *CommonTestSuite) TestSearchStore() {
})
}
// TestSearchStoreEnabledCJK runs the shared CJK post-search tests from the
// searchtest package against the store of the suite's server.
func (c *CommonTestSuite) TestSearchStoreEnabledCJK() {
	t := c.T()
	searchtest.TestSearchPostStoreEnabledCJK(t, c.TH.App.Srv().Store())
}
func (c *CommonTestSuite) TestIndexPost() {
testCases := []struct {
Name string

View file

@ -164,9 +164,27 @@ func (es *ElasticsearchInterfaceImpl) Start() *model.AppError {
http.StatusInternalServerError).Wrap(err)
}
opts := []func(*types.IndexTemplateMapping){}
// Set up additional analyzers to use in the post index template if CJK analyzers are enabled
if *es.Platform.Config().ElasticsearchSettings.EnableCJKAnalyzers {
if slices.Contains(es.plugins, "analysis-nori") {
opts = append(opts, common.WithNoriAnalyzer())
}
if slices.Contains(es.plugins, "analysis-kuromoji") {
opts = append(opts, common.WithKuromojiAnalyzer())
}
if slices.Contains(es.plugins, "analysis-smartcn") {
opts = append(opts, common.WithSmartCNAnalyzer())
}
if len(opts) == 0 {
es.Platform.Log().Warn("EnableCJKAnalyzers is set but no CJK analyzer plugins found installed. Please review elasticsearch settings.")
}
}
// Set up posts index template.
_, err = es.client.API.Indices.PutIndexTemplate(*es.Platform.Config().ElasticsearchSettings.IndexPrefix + common.IndexBasePosts).
Request(common.GetPostTemplate(es.Platform.Config())).
Request(common.GetPostTemplate(es.Platform.Config(), opts...)).
Do(ctx)
if err != nil {
return model.NewAppError("Elasticsearch.start", "ent.elasticsearch.create_template_posts_if_not_exists.template_create_failed", map[string]any{"Backend": model.ElasticsearchSettingsESBackend}, "", http.StatusInternalServerError).Wrap(err)
@ -297,6 +315,30 @@ func (es *ElasticsearchInterfaceImpl) getPostIndexNames() ([]string, error) {
return postIndexes, nil
}
// getFieldVariants returns the list of index fields a query should be matched
// against for the base field fieldName.
//
// The base field is always first. When EnableCJKAnalyzers is set and query
// contains CJK characters, one "<fieldName>.<analyzer>" entry is appended for
// every CJK analysis plugin present in es.plugins, in the order nori,
// kuromoji, smartcn.
func (es *ElasticsearchInterfaceImpl) getFieldVariants(fieldName string, query string) []string {
	variants := []string{fieldName}

	// Read the setting once so the nil check and the dereference below act on
	// the same value instead of on two separate Config() results.
	enabled := es.Platform.Config().ElasticsearchSettings.EnableCJKAnalyzers
	if enabled == nil || !*enabled || !common.ContainsCJK(query) {
		return variants
	}

	// Plugin "analysis-<name>" maps to sub-field "<fieldName>.<name>".
	for _, analyzer := range []string{"nori", "kuromoji", "smartcn"} {
		if slices.Contains(es.plugins, "analysis-"+analyzer) {
			variants = append(variants, fieldName+"."+analyzer)
		}
	}
	return variants
}
func (es *ElasticsearchInterfaceImpl) SearchPosts(channels model.ChannelList, searchParams []*model.SearchParams, page, perPage int) ([]string, model.PostSearchMatches, *model.AppError) {
es.mutex.RLock()
defer es.mutex.RUnlock()
@ -448,13 +490,13 @@ func (es *ElasticsearchInterfaceImpl) SearchPosts(channels model.ChannelList, se
{
SimpleQueryString: &types.SimpleQueryStringQuery{
Query: params.Terms,
Fields: []string{"message"},
Fields: es.getFieldVariants("message", params.Terms),
DefaultOperator: &termOperator,
},
}, {
SimpleQueryString: &types.SimpleQueryStringQuery{
Query: params.Terms,
Fields: []string{"attachments"},
Fields: es.getFieldVariants("attachments", params.Terms),
DefaultOperator: &termOperator,
},
}, {
@ -494,13 +536,13 @@ func (es *ElasticsearchInterfaceImpl) SearchPosts(channels model.ChannelList, se
{
SimpleQueryString: &types.SimpleQueryStringQuery{
Query: params.ExcludedTerms,
Fields: []string{"message"},
Fields: es.getFieldVariants("message", params.ExcludedTerms),
DefaultOperator: &termOperator,
},
}, {
SimpleQueryString: &types.SimpleQueryStringQuery{
Query: params.ExcludedTerms,
Fields: []string{"attachments"},
Fields: es.getFieldVariants("attachments", params.ExcludedTerms),
DefaultOperator: &termOperator,
},
}, {
@ -555,6 +597,7 @@ func (es *ElasticsearchInterfaceImpl) SearchPosts(channels model.ChannelList, se
},
)
// highlighting base fields should be enough even if CJK analyzers are enabled
highlight := &types.Highlight{
HighlightQuery: &types.Query{
Bool: fullHighlightsQuery,

View file

@ -7,6 +7,7 @@ import (
"bytes"
"context"
"encoding/json"
"strings"
"testing"
elastic "github.com/elastic/go-elasticsearch/v8"
@ -70,6 +71,7 @@ func (s *ElasticsearchInterfaceTestSuite) SetupSuite() {
s.th.App.UpdateConfig(func(cfg *model.Config) {
*cfg.ElasticsearchSettings.EnableIndexing = true
*cfg.ElasticsearchSettings.EnableSearching = true
*cfg.ElasticsearchSettings.EnableCJKAnalyzers = false
*cfg.ElasticsearchSettings.EnableAutocomplete = true
*cfg.ElasticsearchSettings.LiveIndexingBatchSize = 1
*cfg.SqlSettings.DisableDatabaseSearch = true
@ -89,6 +91,16 @@ func (s *ElasticsearchInterfaceTestSuite) SetupSuite() {
}
func (s *ElasticsearchInterfaceTestSuite) SetupTest() {
if strings.Contains(s.T().Name(), "CJK") {
s.th.App.UpdateConfig(func(cfg *model.Config) {
*cfg.ElasticsearchSettings.EnableCJKAnalyzers = true
})
} else {
s.th.App.UpdateConfig(func(cfg *model.Config) {
*cfg.ElasticsearchSettings.EnableCJKAnalyzers = false
})
}
s.CommonTestSuite.ESImpl = s.th.App.SearchEngine().ElasticsearchEngine
if s.CommonTestSuite.ESImpl.IsActive() {

View file

@ -153,8 +153,26 @@ func (os *OpensearchInterfaceImpl) Start() *model.AppError {
time.Duration(*esSettings.RequestTimeoutSeconds)*time.Second,
os.Platform.Log())
opts := []func(*types.IndexTemplateMapping){}
// Set up additional analyzers to use in the post index template if CJK analyzers are enabled
if *os.Platform.Config().ElasticsearchSettings.EnableCJKAnalyzers {
if slices.Contains(os.plugins, "analysis-nori") {
opts = append(opts, common.WithNoriAnalyzer())
}
if slices.Contains(os.plugins, "analysis-kuromoji") {
opts = append(opts, common.WithKuromojiAnalyzer())
}
if slices.Contains(os.plugins, "analysis-smartcn") {
opts = append(opts, common.WithSmartCNAnalyzer())
}
if len(opts) == 0 {
os.Platform.Log().Warn("EnableCJKAnalyzers is set but no CJK analyzer plugins found installed. Please review opensearch settings.")
}
}
// Set up posts index template.
templateBuf, err := json.Marshal(common.GetPostTemplate(os.Platform.Config()))
templateBuf, err := json.Marshal(common.GetPostTemplate(os.Platform.Config(), opts...))
if err != nil {
return model.NewAppError("Opensearch.start", "api.marshal_error", nil, "", http.StatusInternalServerError).Wrap(err)
}
@ -314,6 +332,30 @@ func (os *OpensearchInterfaceImpl) getPostIndexNames() ([]string, error) {
return postIndexes, nil
}
// getFieldVariants returns the list of index fields a query should be matched
// against for the base field fieldName.
//
// The base field is always first. When EnableCJKAnalyzers is set and query
// contains CJK characters, one "<fieldName>.<analyzer>" entry is appended for
// every CJK analysis plugin present in os.plugins, in the order nori,
// kuromoji, smartcn.
func (os *OpensearchInterfaceImpl) getFieldVariants(fieldName string, query string) []string {
	variants := []string{fieldName}

	// Read the setting once so the nil check and the dereference below act on
	// the same value instead of on two separate Config() results.
	enabled := os.Platform.Config().ElasticsearchSettings.EnableCJKAnalyzers
	if enabled == nil || !*enabled || !common.ContainsCJK(query) {
		return variants
	}

	// Plugin "analysis-<name>" maps to sub-field "<fieldName>.<name>".
	for _, analyzer := range []string{"nori", "kuromoji", "smartcn"} {
		if slices.Contains(os.plugins, "analysis-"+analyzer) {
			variants = append(variants, fieldName+"."+analyzer)
		}
	}
	return variants
}
func (os *OpensearchInterfaceImpl) SearchPosts(channels model.ChannelList, searchParams []*model.SearchParams, page, perPage int) ([]string, model.PostSearchMatches, *model.AppError) {
os.mutex.RLock()
defer os.mutex.RUnlock()
@ -465,13 +507,13 @@ func (os *OpensearchInterfaceImpl) SearchPosts(channels model.ChannelList, searc
{
SimpleQueryString: &types.SimpleQueryStringQuery{
Query: params.Terms,
Fields: []string{"message"},
Fields: os.getFieldVariants("message", params.Terms),
DefaultOperator: &termOperator,
},
}, {
SimpleQueryString: &types.SimpleQueryStringQuery{
Query: params.Terms,
Fields: []string{"attachments"},
Fields: os.getFieldVariants("attachments", params.Terms),
DefaultOperator: &termOperator,
},
}, {
@ -511,13 +553,13 @@ func (os *OpensearchInterfaceImpl) SearchPosts(channels model.ChannelList, searc
{
SimpleQueryString: &types.SimpleQueryStringQuery{
Query: params.ExcludedTerms,
Fields: []string{"message"},
Fields: os.getFieldVariants("message", params.ExcludedTerms),
DefaultOperator: &termOperator,
},
}, {
SimpleQueryString: &types.SimpleQueryStringQuery{
Query: params.ExcludedTerms,
Fields: []string{"attachments"},
Fields: os.getFieldVariants("attachments", params.ExcludedTerms),
DefaultOperator: &termOperator,
},
}, {
@ -572,6 +614,7 @@ func (os *OpensearchInterfaceImpl) SearchPosts(channels model.ChannelList, searc
},
)
// highlighting base fields should be enough even if CJK analyzers are enabled
highlight := &types.Highlight{
HighlightQuery: &types.Query{
Bool: fullHighlightsQuery,

View file

@ -8,6 +8,7 @@ import (
"context"
"encoding/json"
"os"
"strings"
"testing"
"github.com/opensearch-project/opensearch-go/v4"
@ -115,6 +116,16 @@ func (s *OpensearchInterfaceTestSuite) TearDownSuite() {
}
func (s *OpensearchInterfaceTestSuite) SetupTest() {
if strings.Contains(s.T().Name(), "CJK") {
s.th.App.UpdateConfig(func(cfg *model.Config) {
*cfg.ElasticsearchSettings.EnableCJKAnalyzers = true
})
} else {
s.th.App.UpdateConfig(func(cfg *model.Config) {
*cfg.ElasticsearchSettings.EnableCJKAnalyzers = false
})
}
s.CommonTestSuite.ESImpl = s.th.App.SearchEngine().ElasticsearchEngine
if s.CommonTestSuite.ESImpl.IsActive() {

View file

@ -10052,6 +10052,10 @@
"id": "model.config.is_valid.elastic_search.enable_autocomplete.app_error",
"translation": "{{.EnableIndexing}} setting must be set to true when {{.Autocomplete}} is set to true"
},
{
"id": "model.config.is_valid.elastic_search.enable_cjk_analyzers.app_error",
"translation": "{{.Searching}} setting must be set to true when {{.EnableCJKAnalyzers}} is set to true"
},
{
"id": "model.config.is_valid.elastic_search.enable_searching.app_error",
"translation": "{{.EnableIndexing}} setting must be set to true when {{.Searching}} is set to true"

View file

@ -3122,6 +3122,7 @@ type ElasticsearchSettings struct {
Password *string `access:"environment_elasticsearch,write_restrictable,cloud_restrictable"`
EnableIndexing *bool `access:"environment_elasticsearch,write_restrictable,cloud_restrictable"`
EnableSearching *bool `access:"environment_elasticsearch,write_restrictable,cloud_restrictable"`
EnableCJKAnalyzers *bool `access:"environment_elasticsearch,write_restrictable,cloud_restrictable"`
EnableAutocomplete *bool `access:"environment_elasticsearch,write_restrictable,cloud_restrictable"`
Sniff *bool `access:"environment_elasticsearch,write_restrictable,cloud_restrictable"`
PostIndexReplicas *int `access:"environment_elasticsearch,write_restrictable,cloud_restrictable"`
@ -3183,6 +3184,10 @@ func (s *ElasticsearchSettings) SetDefaults() {
s.EnableSearching = NewPointer(false)
}
if s.EnableCJKAnalyzers == nil {
s.EnableCJKAnalyzers = NewPointer(false)
}
if s.EnableAutocomplete == nil {
s.EnableAutocomplete = NewPointer(false)
}
@ -4728,6 +4733,13 @@ func (s *ElasticsearchSettings) isValid() *AppError {
}, "", http.StatusBadRequest)
}
if *s.EnableCJKAnalyzers && !*s.EnableSearching {
return NewAppError("Config.IsValid", "model.config.is_valid.elastic_search.enable_cjk_analyzers.app_error", map[string]any{
"EnableCJKAnalyzers": "ElasticsearchSettings.EnableCJKAnalyzers",
"Searching": "ElasticsearchSettings.EnableSearching",
}, "", http.StatusBadRequest)
}
if *s.EnableAutocomplete && !*s.EnableIndexing {
return NewAppError("Config.IsValid", "model.config.is_valid.elastic_search.enable_autocomplete.app_error", map[string]any{
"Autocomplete": "ElasticsearchSettings.EnableAutocomplete",

View file

@ -888,6 +888,7 @@ export type ElasticsearchSettings = {
Password: string;
EnableIndexing: boolean;
EnableSearching: boolean;
EnableCJKAnalyzers: boolean;
EnableAutocomplete: boolean;
Sniff: boolean;
PostIndexReplicas: number;