diff --git a/e2e-tests/cypress/tests/support/api/on_prem_default_config.json b/e2e-tests/cypress/tests/support/api/on_prem_default_config.json index 3680a93ce31..fafdacd10d2 100644 --- a/e2e-tests/cypress/tests/support/api/on_prem_default_config.json +++ b/e2e-tests/cypress/tests/support/api/on_prem_default_config.json @@ -484,6 +484,7 @@ "Password": "changeme", "EnableIndexing": false, "EnableSearching": false, + "EnableCJKAnalyzers": false, "EnableAutocomplete": false, "Sniff": false, "PostIndexReplicas": 1, diff --git a/e2e-tests/playwright/lib/src/server/default_config.ts b/e2e-tests/playwright/lib/src/server/default_config.ts index eaeb40c7417..073023408c6 100644 --- a/e2e-tests/playwright/lib/src/server/default_config.ts +++ b/e2e-tests/playwright/lib/src/server/default_config.ts @@ -627,6 +627,7 @@ const defaultServerConfig: AdminConfig = { Password: 'changeme', EnableIndexing: false, EnableSearching: false, + EnableCJKAnalyzers: false, EnableAutocomplete: false, Sniff: true, PostIndexReplicas: 1, diff --git a/server/.golangci.yml b/server/.golangci.yml index a5eeddc21d8..d3f09acf71a 100644 --- a/server/.golangci.yml +++ b/server/.golangci.yml @@ -112,6 +112,7 @@ linters: channels/store/localcachelayer/webhook_layer_test.go|\ channels/store/retrylayer/retrylayer_test.go|\ channels/store/searchtest/channel_layer.go|\ + channels/store/searchtest/cjk_plugins.go|\ channels/store/searchtest/file_info_layer.go|\ channels/store/searchtest/helper.go|\ channels/store/searchtest/post_layer.go|\ diff --git a/server/build/Dockerfile.elasticsearch b/server/build/Dockerfile.elasticsearch new file mode 100644 index 00000000000..161ed4645f7 --- /dev/null +++ b/server/build/Dockerfile.elasticsearch @@ -0,0 +1,4 @@ +ARG ELASTICSEARCH_VERSION=8.9.0 +FROM mattermostdevelopment/mattermost-elasticsearch:${ELASTICSEARCH_VERSION} + +RUN /usr/share/elasticsearch/bin/elasticsearch-plugin install --batch analysis-nori analysis-kuromoji analysis-smartcn \ No newline at end of file diff 
--git a/server/build/Dockerfile.opensearch b/server/build/Dockerfile.opensearch index fe3a44bab5e..b0363e24bb1 100644 --- a/server/build/Dockerfile.opensearch +++ b/server/build/Dockerfile.opensearch @@ -1,4 +1,4 @@ ARG OPENSEARCH_VERSION=2.7.0 FROM opensearchproject/opensearch:$OPENSEARCH_VERSION -RUN /usr/share/opensearch/bin/opensearch-plugin install analysis-icu +RUN /usr/share/opensearch/bin/opensearch-plugin install analysis-icu analysis-nori analysis-kuromoji analysis-smartcn diff --git a/server/build/docker-compose.common.yml b/server/build/docker-compose.common.yml index 41bad713572..407a6157def 100644 --- a/server/build/docker-compose.common.yml +++ b/server/build/docker-compose.common.yml @@ -47,7 +47,9 @@ services: LDAP_DOMAIN: "mm.test.com" LDAP_ADMIN_PASSWORD: "mostest" elasticsearch: - image: "mattermostdevelopment/mattermost-elasticsearch:8.9.0" + build: + context: . + dockerfile: ./Dockerfile.elasticsearch networks: - mm-test environment: diff --git a/server/channels/store/searchtest/cjk_plugins.go b/server/channels/store/searchtest/cjk_plugins.go new file mode 100644 index 00000000000..bddb91ff721 --- /dev/null +++ b/server/channels/store/searchtest/cjk_plugins.go @@ -0,0 +1,253 @@ +// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved. +// See LICENSE.txt for license information. 
+ +package searchtest + +import ( + "testing" + + "github.com/mattermost/mattermost/server/public/model" + "github.com/mattermost/mattermost/server/public/shared/request" + "github.com/mattermost/mattermost/server/v8/channels/store" + "github.com/stretchr/testify/require" +) + +func TestSearchPostStoreEnabledCJK(t *testing.T, s store.Store) { + th := &SearchTestHelper{ + Context: request.TestContext(t), + Store: s, + } + err := th.SetupBasicFixtures() + require.NoError(t, err) + defer th.CleanFixtures() + + t.Run("Korean searches using nori analyzer", func(t *testing.T) { + t.Run("should be able to search with wildcard and exact search", func(t *testing.T) { + p1, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "한글", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + p2, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "한국", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + defer th.deleteUserPosts(th.User.Id) + + // Exact search + params := &model.SearchParams{Terms: "한글"} + results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 1) + th.checkPostInSearchResults(t, p1.Id, results.Posts) + + // Wildcard search + params = &model.SearchParams{Terms: "한*"} + results, err = th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 2) + th.checkPostInSearchResults(t, p1.Id, results.Posts) + th.checkPostInSearchResults(t, p2.Id, results.Posts) + }) + + t.Run("should search one word and phrase with Nori segmentation", func(t *testing.T) { + pBul, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "불", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + pBulda, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "불다", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + p3, 
err := th.createPost(th.User.Id, th.ChannelBasic.Id, "소고기덮밥", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + p4, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "치킨덮밥", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + defer th.deleteUserPosts(th.User.Id) + + // One word "불": Nori segments "불다" into "불"+"다", so "불" matches both posts + params := &model.SearchParams{Terms: "불"} + results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 2) + th.checkPostInSearchResults(t, pBul.Id, results.Posts) + th.checkPostInSearchResults(t, pBulda.Id, results.Posts) + + // Unquoted "불다": SimpleQueryString treats one term as OR of analyzed tokens (불 OR 다), so both posts match + params = &model.SearchParams{Terms: "불다"} + results, err = th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 2) + th.checkPostInSearchResults(t, pBul.Id, results.Posts) + th.checkPostInSearchResults(t, pBulda.Id, results.Posts) + + // Unquoted 덮밥 should match 소고기덮밥 and 치킨덮밥 + params = &model.SearchParams{Terms: "덮밥"} + results, err = th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 2) + th.checkPostInSearchResults(t, p3.Id, results.Posts) + th.checkPostInSearchResults(t, p4.Id, results.Posts) + }) + + t.Run("should search in mixed Korean and English content", func(t *testing.T) { + p5, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "오늘 회의실 예약 meeting", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + defer th.deleteUserPosts(th.User.Id) + + params := &model.SearchParams{Terms: "회의실"} + results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, 
th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 1) + th.checkPostInSearchResults(t, p5.Id, results.Posts) + }) + + t.Run("should search using phrase search", func(t *testing.T) { + p6, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "오늘 회의실 예약", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + defer th.deleteUserPosts(th.User.Id) + + params := &model.SearchParams{Terms: "\"오늘 회의실\""} + results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 1) + th.checkPostInSearchResults(t, p6.Id, results.Posts) + }) + }) + + t.Run("Japanese searches using kuromoji analyzer", func(t *testing.T) { + t.Run("should be able to search using wildcard and exact search", func(t *testing.T) { + p1, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "東京", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + p2, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "東北", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + defer th.deleteUserPosts(th.User.Id) + + // Exact match + params := &model.SearchParams{Terms: "東京"} + results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 1) + th.checkPostInSearchResults(t, p1.Id, results.Posts) + + // Wildcard search + params = &model.SearchParams{Terms: "東*"} + results, err = th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 2) + th.checkPostInSearchResults(t, p1.Id, results.Posts) + th.checkPostInSearchResults(t, p2.Id, results.Posts) + }) + + t.Run("should search in mixed Japanese and English content", func(t *testing.T) { + p4, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "projectの締め切りは来週", "", 
model.PostTypeDefault, 0, false) + require.NoError(t, err) + defer th.deleteUserPosts(th.User.Id) + + params := &model.SearchParams{Terms: "締め切り"} + results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 1) + th.checkPostInSearchResults(t, p4.Id, results.Posts) + }) + + t.Run("should search using phrase search", func(t *testing.T) { + p3, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "今日の会議は中止です", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + defer th.deleteUserPosts(th.User.Id) + + params := &model.SearchParams{Terms: "\"今日の会議\""} + results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 1) + th.checkPostInSearchResults(t, p3.Id, results.Posts) + }) + + t.Run("should find conjugated verb forms when searching infinitive", func(t *testing.T) { + // Create posts with different verb conjugations + pInfinitive, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "食べる", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + pPast, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "昨日ラーメンを食べました", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + pTe, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "食べている", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + pNegative, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "食べない", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + defer th.deleteUserPosts(th.User.Id) + + // Search for infinitive form should find all conjugated forms + params := &model.SearchParams{Terms: "食べる"} + results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 4) + 
th.checkPostInSearchResults(t, pInfinitive.Id, results.Posts) + th.checkPostInSearchResults(t, pPast.Id, results.Posts) + th.checkPostInSearchResults(t, pTe.Id, results.Posts) + th.checkPostInSearchResults(t, pNegative.Id, results.Posts) + }) + }) + + t.Run("Chinese searches using smartcn analyzer", func(t *testing.T) { + t.Run("should be able to search using wildcard and exact search", func(t *testing.T) { + p1, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "电脑", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + p2, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "电话", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + defer th.deleteUserPosts(th.User.Id) + + // Exact search + params := &model.SearchParams{Terms: "电脑"} + results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 1) + th.checkPostInSearchResults(t, p1.Id, results.Posts) + + // Wildcard search + params = &model.SearchParams{Terms: "电*"} + results, err = th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 2) + th.checkPostInSearchResults(t, p1.Id, results.Posts) + th.checkPostInSearchResults(t, p2.Id, results.Posts) + }) + + t.Run("should search one and two characters with SmartCN segmentation (你 / 你好)", func(t *testing.T) { + pNiHao, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "你好", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + pNi, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "你", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + defer th.deleteUserPosts(th.User.Id) + + // One character: SmartCN segments "你好" into "你"+"好", so "你" matches both + params := &model.SearchParams{Terms: "你"} + results, err := th.Store.Post().SearchPostsForUser(th.Context, 
[]*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 2) + th.checkPostInSearchResults(t, pNi.Id, results.Posts) + th.checkPostInSearchResults(t, pNiHao.Id, results.Posts) + + // Two characters (unquoted): SimpleQueryString matches any analyzed token (你 OR 好), so both posts match + params = &model.SearchParams{Terms: "你好"} + results, err = th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 2) + th.checkPostInSearchResults(t, pNi.Id, results.Posts) + th.checkPostInSearchResults(t, pNiHao.Id, results.Posts) + }) + + t.Run("should search in mixed Chinese and English content", func(t *testing.T) { + p3, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "this is 今天开会讨论API接口 content", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + defer th.deleteUserPosts(th.User.Id) + + params := &model.SearchParams{Terms: "接口"} + results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 1) + th.checkPostInSearchResults(t, p3.Id, results.Posts) + }) + + t.Run("should search using phrase search", func(t *testing.T) { + p4, err := th.createPost(th.User.Id, th.ChannelBasic.Id, "this is 今天开会讨论API接口 content", "", model.PostTypeDefault, 0, false) + require.NoError(t, err) + defer th.deleteUserPosts(th.User.Id) + + params := &model.SearchParams{Terms: "\"今天开会\""} + results, err := th.Store.Post().SearchPostsForUser(th.Context, []*model.SearchParams{params}, th.User.Id, th.Team.Id, 0, 20) + require.NoError(t, err) + require.Len(t, results.Posts, 1) + th.checkPostInSearchResults(t, p4.Id, results.Posts) + }) + }) +} diff --git a/server/docker-compose.makefile.m1.yml b/server/docker-compose.makefile.m1.yml index 9d1a646990b..f381ee11032 100644 --- 
// ContainsCJK reports whether s contains at least one rune belonging to the
// Han, Hangul, Hiragana or Katakana scripts.
//
// Han covers Chinese characters (which are also used in Japanese text), Hangul
// covers Korean, and Hiragana and Katakana cover Japanese. The empty string,
// and any string made up only of runes from other scripts, yields false.
func ContainsCJK(s string) bool {
	for _, r := range s {
		// unicode.In checks the rune against all four script tables in one call.
		if unicode.In(r, unicode.Han, unicode.Hangul, unicode.Hiragana, unicode.Katakana) {
			return true
		}
	}

	return false
}
*types.TypeMapping, propertyName string, analyzer string) { + // property name must be a text property due to default mappings + if textProp, ok := mappings.Properties[propertyName].(types.TextProperty); ok { + if textProp.Fields == nil { + textProp.Fields = make(map[string]types.Property) + } + + textProp.Fields[analyzer] = types.TextProperty{ + Analyzer: model.NewPointer("mm_" + analyzer), + Type: "text", + } + + mappings.Properties[propertyName] = textProp + } +} + +func WithNoriAnalyzer() func(template *types.IndexTemplateMapping) { + return func(template *types.IndexTemplateMapping) { + mlog.Info("using nori analyzer") + + addQueryProperty(template.Mappings, "message", "nori") + addQueryProperty(template.Mappings, "attachments", "nori") + + template.Settings.Analysis.Tokenizer["nori_tokenizer"] = map[string]any{ + "type": "nori_tokenizer", + "decompound_mode": "mixed", + } + + template.Settings.Analysis.Analyzer["mm_nori"] = map[string]any{ + "char_filter": []string{ + "leading_underscores", + "trailing_underscores", }, - "attachments": types.TextProperty{ - Analyzer: model.NewPointer("mm_lowercaser"), - Type: "text", + "tokenizer": "nori_tokenizer", + "filter": []string{ + "nori_readingform", + "nori_part_of_speech", + "lowercase", }, - "urls": types.TextProperty{ - Analyzer: model.NewPointer("mm_url"), - Type: "text", + } + } +} + +func WithKuromojiAnalyzer() func(template *types.IndexTemplateMapping) { + return func(template *types.IndexTemplateMapping) { + mlog.Info("using kuromoji analyzer") + + addQueryProperty(template.Mappings, "message", "kuromoji") + addQueryProperty(template.Mappings, "attachments", "kuromoji") + + template.Settings.Analysis.Tokenizer["kuromoji_tokenizer"] = map[string]any{ + "type": "kuromoji_tokenizer", + "mode": "search", + } + + template.Settings.Analysis.Analyzer["mm_kuromoji"] = map[string]any{ + "char_filter": []string{ + "leading_underscores", + "trailing_underscores", }, - "hashtags": types.KeywordProperty{ - Type: 
"keyword", - Normalizer: model.NewPointer("mm_hashtag"), - Store: model.NewPointer(true), + "tokenizer": "kuromoji_tokenizer", + "filter": []string{ + "kuromoji_baseform", + "kuromoji_part_of_speech", + "ja_stop", + "kuromoji_stemmer", + "lowercase", + }, + } + } +} + +func WithSmartCNAnalyzer() func(template *types.IndexTemplateMapping) { + return func(template *types.IndexTemplateMapping) { + mlog.Info("using smartcn analyzer") + + addQueryProperty(template.Mappings, "message", "smartcn") + addQueryProperty(template.Mappings, "attachments", "smartcn") + + template.Settings.Analysis.Analyzer["mm_smartcn"] = map[string]any{ + "char_filter": []string{ + "leading_underscores", + "trailing_underscores", + }, + "tokenizer": "smartcn_tokenizer", + "filter": []string{ + "smartcn_stop", + "lowercase", + }, + } + } +} + +func GetPostTemplate(cfg *model.Config, opts ...func(*types.IndexTemplateMapping)) *putindextemplate.Request { + template := &types.IndexTemplateMapping{ + Settings: &types.IndexSettings{ + Index: &types.IndexSettings{ + NumberOfShards: model.NewPointer(strconv.Itoa(*cfg.ElasticsearchSettings.PostIndexShards)), + NumberOfReplicas: model.NewPointer(strconv.Itoa(*cfg.ElasticsearchSettings.PostIndexReplicas)), + }, + Analysis: &types.IndexSettingsAnalysis{ + Tokenizer: map[string]types.Tokenizer{}, + CharFilter: map[string]types.CharFilter{ + "leading_underscores": map[string]any{ + "type": "pattern_replace", + "pattern": `(^|[\s\r\n])_`, + "replacement": "$1", + }, + "trailing_underscores": map[string]any{ + "type": "pattern_replace", + "pattern": `_([\s\r\n]|$)`, + "replacement": "$1", + }, + }, + Analyzer: map[string]types.Analyzer{ + "mm_lowercaser": map[string]any{ + "tokenizer": "icu_tokenizer", + "filter": []string{ + "icu_normalizer", + "mm_snowball", + "mm_stop", + }, + "char_filter": []string{ + "leading_underscores", + "trailing_underscores", + }, + }, + "mm_url": map[string]any{ + "tokenizer": "pattern", + "pattern": "\\W", + "lowercase": true, + 
}, + }, + Filter: map[string]types.TokenFilter{ + "mm_snowball": map[string]any{ + "type": "snowball", + "language": "English", + }, + "mm_stop": map[string]any{ + "type": "stop", + "stopwords": "_english_", + }, + }, + Normalizer: map[string]types.Normalizer{ + "mm_hashtag": map[string]any{ + "type": "custom", + "char_filter": []string{}, + "filter": []string{"lowercase", "icu_normalizer"}, + }, + }, + }, + }, + Mappings: &types.TypeMapping{ + Properties: map[string]types.Property{ + "message": types.TextProperty{ + Analyzer: model.NewPointer("mm_lowercaser"), + Type: "text", + }, + "attachments": types.TextProperty{ + Analyzer: model.NewPointer("mm_lowercaser"), + Type: "text", + }, + "urls": types.TextProperty{ + Analyzer: model.NewPointer("mm_url"), + Type: "text", + }, + "hashtags": types.KeywordProperty{ + Type: "keyword", + Normalizer: model.NewPointer("mm_hashtag"), + Store: model.NewPointer(true), + }, }, }, } + for _, opt := range opts { + opt(template) + } + return &putindextemplate.Request{ IndexPatterns: []string{*cfg.ElasticsearchSettings.IndexPrefix + IndexBasePosts + "*"}, - Template: &types.IndexTemplateMapping{ - Settings: &types.IndexSettings{ - Index: &types.IndexSettings{ - NumberOfShards: model.NewPointer(strconv.Itoa(*cfg.ElasticsearchSettings.PostIndexShards)), - NumberOfReplicas: model.NewPointer(strconv.Itoa(*cfg.ElasticsearchSettings.PostIndexReplicas)), - }, - Analysis: &types.IndexSettingsAnalysis{ - CharFilter: map[string]types.CharFilter{ - "leading_underscores": map[string]any{ - "type": "pattern_replace", - "pattern": `(^|[\s\r\n])_`, - "replacement": "$1", - }, - "trailing_underscores": map[string]any{ - "type": "pattern_replace", - "pattern": `_([\s\r\n]|$)`, - "replacement": "$1", - }, - }, - Analyzer: map[string]types.Analyzer{ - "mm_lowercaser": map[string]any{ - "tokenizer": "icu_tokenizer", - "filter": []string{ - "icu_normalizer", - "mm_snowball", - "mm_stop", - }, - "char_filter": []string{ - "leading_underscores", - 
"trailing_underscores", - }, - }, - "mm_url": map[string]any{ - "tokenizer": "pattern", - "pattern": "\\W", - "lowercase": true, - }}, - Filter: map[string]types.TokenFilter{ - "mm_snowball": map[string]any{ - "type": "snowball", - "language": "English", - }, - "mm_stop": map[string]any{ - "type": "stop", - "stopwords": "_english_", - }, - }, - Normalizer: map[string]types.Normalizer{ - "mm_hashtag": map[string]any{ - "type": "custom", - "char_filter": []string{}, - "filter": []string{"lowercase", "icu_normalizer"}, - }, - }, - }, - }, - Mappings: mappings, - }, + Template: template, } } diff --git a/server/enterprise/elasticsearch/common/test_suite.go b/server/enterprise/elasticsearch/common/test_suite.go index 9a459ea916f..c90f86d3676 100644 --- a/server/enterprise/elasticsearch/common/test_suite.go +++ b/server/enterprise/elasticsearch/common/test_suite.go @@ -48,6 +48,10 @@ func (c *CommonTestSuite) TestSearchStore() { }) } +func (c *CommonTestSuite) TestSearchStoreEnabledCJK() { + searchtest.TestSearchPostStoreEnabledCJK(c.T(), c.TH.App.Srv().Store()) +} + func (c *CommonTestSuite) TestIndexPost() { testCases := []struct { Name string diff --git a/server/enterprise/elasticsearch/elasticsearch/elasticsearch.go b/server/enterprise/elasticsearch/elasticsearch/elasticsearch.go index bcaf285dcc3..e68c1134d04 100644 --- a/server/enterprise/elasticsearch/elasticsearch/elasticsearch.go +++ b/server/enterprise/elasticsearch/elasticsearch/elasticsearch.go @@ -164,9 +164,27 @@ func (es *ElasticsearchInterfaceImpl) Start() *model.AppError { http.StatusInternalServerError).Wrap(err) } + opts := []func(*types.IndexTemplateMapping){} + // Set up additional analyzers to use in the post index template if CJK analyzers are enabled + if *es.Platform.Config().ElasticsearchSettings.EnableCJKAnalyzers { + if slices.Contains(es.plugins, "analysis-nori") { + opts = append(opts, common.WithNoriAnalyzer()) + } + if slices.Contains(es.plugins, "analysis-kuromoji") { + opts = 
append(opts, common.WithKuromojiAnalyzer()) + } + if slices.Contains(es.plugins, "analysis-smartcn") { + opts = append(opts, common.WithSmartCNAnalyzer()) + } + + if len(opts) == 0 { + es.Platform.Log().Warn("EnableCJKAnalyzers is set but no CJK analyzer plugins found installed. Please review elasticsearch settings.") + } + } + // Set up posts index template. _, err = es.client.API.Indices.PutIndexTemplate(*es.Platform.Config().ElasticsearchSettings.IndexPrefix + common.IndexBasePosts). - Request(common.GetPostTemplate(es.Platform.Config())). + Request(common.GetPostTemplate(es.Platform.Config(), opts...)). Do(ctx) if err != nil { return model.NewAppError("Elasticsearch.start", "ent.elasticsearch.create_template_posts_if_not_exists.template_create_failed", map[string]any{"Backend": model.ElasticsearchSettingsESBackend}, "", http.StatusInternalServerError).Wrap(err) @@ -297,6 +315,30 @@ func (es *ElasticsearchInterfaceImpl) getPostIndexNames() ([]string, error) { return postIndexes, nil } +func (es *ElasticsearchInterfaceImpl) getFieldVariants(fieldName string, query string) []string { + variants := []string{fieldName} + + if es.Platform.Config().ElasticsearchSettings.EnableCJKAnalyzers == nil || + !*es.Platform.Config().ElasticsearchSettings.EnableCJKAnalyzers || + !common.ContainsCJK(query) { + return variants + } + + if slices.Contains(es.plugins, "analysis-nori") { + variants = append(variants, fieldName+".nori") + } + + if slices.Contains(es.plugins, "analysis-kuromoji") { + variants = append(variants, fieldName+".kuromoji") + } + + if slices.Contains(es.plugins, "analysis-smartcn") { + variants = append(variants, fieldName+".smartcn") + } + + return variants +} + func (es *ElasticsearchInterfaceImpl) SearchPosts(channels model.ChannelList, searchParams []*model.SearchParams, page, perPage int) ([]string, model.PostSearchMatches, *model.AppError) { es.mutex.RLock() defer es.mutex.RUnlock() @@ -448,13 +490,13 @@ func (es *ElasticsearchInterfaceImpl) 
SearchPosts(channels model.ChannelList, se { SimpleQueryString: &types.SimpleQueryStringQuery{ Query: params.Terms, - Fields: []string{"message"}, + Fields: es.getFieldVariants("message", params.Terms), DefaultOperator: &termOperator, }, }, { SimpleQueryString: &types.SimpleQueryStringQuery{ Query: params.Terms, - Fields: []string{"attachments"}, + Fields: es.getFieldVariants("attachments", params.Terms), DefaultOperator: &termOperator, }, }, { @@ -494,13 +536,13 @@ func (es *ElasticsearchInterfaceImpl) SearchPosts(channels model.ChannelList, se { SimpleQueryString: &types.SimpleQueryStringQuery{ Query: params.ExcludedTerms, - Fields: []string{"message"}, + Fields: es.getFieldVariants("message", params.ExcludedTerms), DefaultOperator: &termOperator, }, }, { SimpleQueryString: &types.SimpleQueryStringQuery{ Query: params.ExcludedTerms, - Fields: []string{"attachments"}, + Fields: es.getFieldVariants("attachments", params.ExcludedTerms), DefaultOperator: &termOperator, }, }, { @@ -555,6 +597,7 @@ func (es *ElasticsearchInterfaceImpl) SearchPosts(channels model.ChannelList, se }, ) + // highlighting base fields should be enough even if CJK analyzers are enabled highlight := &types.Highlight{ HighlightQuery: &types.Query{ Bool: fullHighlightsQuery, diff --git a/server/enterprise/elasticsearch/elasticsearch/elasticsearch_test.go b/server/enterprise/elasticsearch/elasticsearch/elasticsearch_test.go index 3d193bc4b35..f3b20420d63 100644 --- a/server/enterprise/elasticsearch/elasticsearch/elasticsearch_test.go +++ b/server/enterprise/elasticsearch/elasticsearch/elasticsearch_test.go @@ -7,6 +7,7 @@ import ( "bytes" "context" "encoding/json" + "strings" "testing" elastic "github.com/elastic/go-elasticsearch/v8" @@ -70,6 +71,7 @@ func (s *ElasticsearchInterfaceTestSuite) SetupSuite() { s.th.App.UpdateConfig(func(cfg *model.Config) { *cfg.ElasticsearchSettings.EnableIndexing = true *cfg.ElasticsearchSettings.EnableSearching = true + 
*cfg.ElasticsearchSettings.EnableCJKAnalyzers = false *cfg.ElasticsearchSettings.EnableAutocomplete = true *cfg.ElasticsearchSettings.LiveIndexingBatchSize = 1 *cfg.SqlSettings.DisableDatabaseSearch = true @@ -89,6 +91,16 @@ func (s *ElasticsearchInterfaceTestSuite) SetupSuite() { } func (s *ElasticsearchInterfaceTestSuite) SetupTest() { + if strings.Contains(s.T().Name(), "CJK") { + s.th.App.UpdateConfig(func(cfg *model.Config) { + *cfg.ElasticsearchSettings.EnableCJKAnalyzers = true + }) + } else { + s.th.App.UpdateConfig(func(cfg *model.Config) { + *cfg.ElasticsearchSettings.EnableCJKAnalyzers = false + }) + } + s.CommonTestSuite.ESImpl = s.th.App.SearchEngine().ElasticsearchEngine if s.CommonTestSuite.ESImpl.IsActive() { diff --git a/server/enterprise/elasticsearch/opensearch/opensearch.go b/server/enterprise/elasticsearch/opensearch/opensearch.go index 1a542fa450d..7743d58ebc6 100644 --- a/server/enterprise/elasticsearch/opensearch/opensearch.go +++ b/server/enterprise/elasticsearch/opensearch/opensearch.go @@ -153,8 +153,26 @@ func (os *OpensearchInterfaceImpl) Start() *model.AppError { time.Duration(*esSettings.RequestTimeoutSeconds)*time.Second, os.Platform.Log()) + opts := []func(*types.IndexTemplateMapping){} + // Set up additional analyzers to use in the post index template if CJK analyzers are enabled + if *os.Platform.Config().ElasticsearchSettings.EnableCJKAnalyzers { + if slices.Contains(os.plugins, "analysis-nori") { + opts = append(opts, common.WithNoriAnalyzer()) + } + if slices.Contains(os.plugins, "analysis-kuromoji") { + opts = append(opts, common.WithKuromojiAnalyzer()) + } + if slices.Contains(os.plugins, "analysis-smartcn") { + opts = append(opts, common.WithSmartCNAnalyzer()) + } + + if len(opts) == 0 { + os.Platform.Log().Warn("EnableCJKAnalyzers is set but no CJK analyzer plugins found installed. Please review opensearch settings.") + } + } + // Set up posts index template. 
- templateBuf, err := json.Marshal(common.GetPostTemplate(os.Platform.Config())) + templateBuf, err := json.Marshal(common.GetPostTemplate(os.Platform.Config(), opts...)) if err != nil { return model.NewAppError("Opensearch.start", "api.marshal_error", nil, "", http.StatusInternalServerError).Wrap(err) } @@ -314,6 +332,30 @@ func (os *OpensearchInterfaceImpl) getPostIndexNames() ([]string, error) { return postIndexes, nil } +func (os *OpensearchInterfaceImpl) getFieldVariants(fieldName string, query string) []string { + variants := []string{fieldName} + + if os.Platform.Config().ElasticsearchSettings.EnableCJKAnalyzers == nil || + !*os.Platform.Config().ElasticsearchSettings.EnableCJKAnalyzers || + !common.ContainsCJK(query) { + return variants + } + + if slices.Contains(os.plugins, "analysis-nori") { + variants = append(variants, fieldName+".nori") + } + + if slices.Contains(os.plugins, "analysis-kuromoji") { + variants = append(variants, fieldName+".kuromoji") + } + + if slices.Contains(os.plugins, "analysis-smartcn") { + variants = append(variants, fieldName+".smartcn") + } + + return variants +} + func (os *OpensearchInterfaceImpl) SearchPosts(channels model.ChannelList, searchParams []*model.SearchParams, page, perPage int) ([]string, model.PostSearchMatches, *model.AppError) { os.mutex.RLock() defer os.mutex.RUnlock() @@ -465,13 +507,13 @@ func (os *OpensearchInterfaceImpl) SearchPosts(channels model.ChannelList, searc { SimpleQueryString: &types.SimpleQueryStringQuery{ Query: params.Terms, - Fields: []string{"message"}, + Fields: os.getFieldVariants("message", params.Terms), DefaultOperator: &termOperator, }, }, { SimpleQueryString: &types.SimpleQueryStringQuery{ Query: params.Terms, - Fields: []string{"attachments"}, + Fields: os.getFieldVariants("attachments", params.Terms), DefaultOperator: &termOperator, }, }, { @@ -511,13 +553,13 @@ func (os *OpensearchInterfaceImpl) SearchPosts(channels model.ChannelList, searc { SimpleQueryString: 
&types.SimpleQueryStringQuery{ Query: params.ExcludedTerms, - Fields: []string{"message"}, + Fields: os.getFieldVariants("message", params.ExcludedTerms), DefaultOperator: &termOperator, }, }, { SimpleQueryString: &types.SimpleQueryStringQuery{ Query: params.ExcludedTerms, - Fields: []string{"attachments"}, + Fields: os.getFieldVariants("attachments", params.ExcludedTerms), DefaultOperator: &termOperator, }, }, { @@ -572,6 +614,7 @@ func (os *OpensearchInterfaceImpl) SearchPosts(channels model.ChannelList, searc }, ) + // highlighting base fields should be enough even if CJK analyzers are enabled highlight := &types.Highlight{ HighlightQuery: &types.Query{ Bool: fullHighlightsQuery, diff --git a/server/enterprise/elasticsearch/opensearch/opensearch_test.go b/server/enterprise/elasticsearch/opensearch/opensearch_test.go index 57ee0a1f5c8..6fe6d0a5a8e 100644 --- a/server/enterprise/elasticsearch/opensearch/opensearch_test.go +++ b/server/enterprise/elasticsearch/opensearch/opensearch_test.go @@ -8,6 +8,7 @@ import ( "context" "encoding/json" "os" + "strings" "testing" "github.com/opensearch-project/opensearch-go/v4" @@ -115,6 +116,16 @@ func (s *OpensearchInterfaceTestSuite) TearDownSuite() { } func (s *OpensearchInterfaceTestSuite) SetupTest() { + if strings.Contains(s.T().Name(), "CJK") { + s.th.App.UpdateConfig(func(cfg *model.Config) { + *cfg.ElasticsearchSettings.EnableCJKAnalyzers = true + }) + } else { + s.th.App.UpdateConfig(func(cfg *model.Config) { + *cfg.ElasticsearchSettings.EnableCJKAnalyzers = false + }) + } + s.CommonTestSuite.ESImpl = s.th.App.SearchEngine().ElasticsearchEngine if s.CommonTestSuite.ESImpl.IsActive() { diff --git a/server/i18n/en.json b/server/i18n/en.json index 8e90a137f93..9ed7897b96a 100644 --- a/server/i18n/en.json +++ b/server/i18n/en.json @@ -10052,6 +10052,10 @@ "id": "model.config.is_valid.elastic_search.enable_autocomplete.app_error", "translation": "{{.EnableIndexing}} setting must be set to true when {{.Autocomplete}} is set 
to true" }, + { + "id": "model.config.is_valid.elastic_search.enable_cjk_analyzers.app_error", + "translation": "{{.Searching}} setting must be set to true when {{.EnableCJKAnalyzers}} is set to true" + }, { "id": "model.config.is_valid.elastic_search.enable_searching.app_error", "translation": "{{.EnableIndexing}} setting must be set to true when {{.Searching}} is set to true" diff --git a/server/public/model/config.go b/server/public/model/config.go index 2a703d71e1c..b72730b4fec 100644 --- a/server/public/model/config.go +++ b/server/public/model/config.go @@ -3122,6 +3122,7 @@ type ElasticsearchSettings struct { Password *string `access:"environment_elasticsearch,write_restrictable,cloud_restrictable"` EnableIndexing *bool `access:"environment_elasticsearch,write_restrictable,cloud_restrictable"` EnableSearching *bool `access:"environment_elasticsearch,write_restrictable,cloud_restrictable"` + EnableCJKAnalyzers *bool `access:"environment_elasticsearch,write_restrictable,cloud_restrictable"` EnableAutocomplete *bool `access:"environment_elasticsearch,write_restrictable,cloud_restrictable"` Sniff *bool `access:"environment_elasticsearch,write_restrictable,cloud_restrictable"` PostIndexReplicas *int `access:"environment_elasticsearch,write_restrictable,cloud_restrictable"` @@ -3183,6 +3184,10 @@ func (s *ElasticsearchSettings) SetDefaults() { s.EnableSearching = NewPointer(false) } + if s.EnableCJKAnalyzers == nil { + s.EnableCJKAnalyzers = NewPointer(false) + } + if s.EnableAutocomplete == nil { s.EnableAutocomplete = NewPointer(false) } @@ -4728,6 +4733,13 @@ func (s *ElasticsearchSettings) isValid() *AppError { }, "", http.StatusBadRequest) } + if *s.EnableCJKAnalyzers && !*s.EnableSearching { + return NewAppError("Config.IsValid", "model.config.is_valid.elastic_search.enable_cjk_analyzers.app_error", map[string]any{ + "EnableCJKAnalyzers": "ElasticsearchSettings.EnableCJKAnalyzers", + "Searching": "ElasticsearchSettings.EnableSearching", + }, "", 
http.StatusBadRequest) + } + if *s.EnableAutocomplete && !*s.EnableIndexing { return NewAppError("Config.IsValid", "model.config.is_valid.elastic_search.enable_autocomplete.app_error", map[string]any{ "Autocomplete": "ElasticsearchSettings.EnableAutocomplete", diff --git a/webapp/platform/types/src/config.ts b/webapp/platform/types/src/config.ts index 019ef7719b7..7834f549cf2 100644 --- a/webapp/platform/types/src/config.ts +++ b/webapp/platform/types/src/config.ts @@ -888,6 +888,7 @@ export type ElasticsearchSettings = { Password: string; EnableIndexing: boolean; EnableSearching: boolean; + EnableCJKAnalyzers: boolean; EnableAutocomplete: boolean; Sniff: boolean; PostIndexReplicas: number;