mattermost/server/enterprise/elasticsearch/common/common.go
Doug Lauder 3a2cc5e242
Some checks are pending
API / build (push) Waiting to run
Server CI / Compute Go Version (push) Waiting to run
Server CI / Check mocks (push) Blocked by required conditions
Server CI / Check go mod tidy (push) Blocked by required conditions
Server CI / check-style (push) Blocked by required conditions
Server CI / Check serialization methods for hot structs (push) Blocked by required conditions
Server CI / Vet API (push) Blocked by required conditions
Server CI / Check migration files (push) Blocked by required conditions
Server CI / Generate email templates (push) Blocked by required conditions
Server CI / Check store layers (push) Blocked by required conditions
Server CI / Check mmctl docs (push) Blocked by required conditions
Server CI / Postgres with binary parameters (push) Blocked by required conditions
Server CI / Postgres (push) Blocked by required conditions
Server CI / Postgres (FIPS) (push) Blocked by required conditions
Server CI / Generate Test Coverage (push) Blocked by required conditions
Server CI / Run mmctl tests (push) Blocked by required conditions
Server CI / Run mmctl tests (FIPS) (push) Blocked by required conditions
Server CI / Build mattermost server app (push) Blocked by required conditions
Web App CI / check-lint (push) Waiting to run
Web App CI / check-i18n (push) Blocked by required conditions
Web App CI / check-external-links (push) Blocked by required conditions
Web App CI / check-types (push) Blocked by required conditions
Web App CI / test (platform) (push) Blocked by required conditions
Web App CI / test (mattermost-redux) (push) Blocked by required conditions
Web App CI / test (channels shard 1/4) (push) Blocked by required conditions
Web App CI / test (channels shard 2/4) (push) Blocked by required conditions
Web App CI / test (channels shard 3/4) (push) Blocked by required conditions
Web App CI / test (channels shard 4/4) (push) Blocked by required conditions
Web App CI / upload-coverage (push) Blocked by required conditions
Web App CI / build (push) Blocked by required conditions
ES search: include footer and author name in indexed fields. (#35603)
* Include footer and author name in indexed fields.
2026-03-16 15:35:02 -04:00

447 lines
12 KiB
Go

// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
// See LICENSE.enterprise for license information.
package common
import (
"encoding/xml"
"fmt"
"io"
"net/url"
"regexp"
"runtime"
"strings"
"time"
"github.com/mattermost/mattermost/server/public/model"
"github.com/mattermost/mattermost/server/v8/platform/services/searchengine"
"github.com/mattermost/mattermost/server/v8/platform/shared/filestore"
)
const (
MaxLineLength = 10000
URLRegexpRE = `(\b|^)(?:https?:\/\/)?[a-zA-Z0-9-.]+\.[a-z]+(\s|\)?[a-zA-Z0-9\-._~:/?#\[\]@!$&'\(\)*\+,;=]*)(\b|$)`
URLMarkdownLinkRE = `(\[[^\]]+\]\([a-zA-Z0-9\-._~:/?#\[\]@!$&'\(\)*\+,;=]+\))`
EmailRE = `^[^\s"]+@[^\s"]+$`
IndexBasePosts = "posts"
IndexBasePosts_MONTH = IndexBasePosts + "month"
IndexBaseChannels = "channels"
IndexBaseUsers = "users"
IndexBaseFiles = "files"
// At the moment, this number is hardcoded. If needed, we can expose
// this to the config.
BulkFlushInterval = 5 * time.Second
// Size of the largest request to be done, in bytes
BulkFlushBytes = 10 * 1024 * 1024 // 10 MiB
)
type BulkSettings struct {
FlushBytes int
FlushInterval time.Duration
FlushNumReqs int
}
var (
urlRe = regexp.MustCompile(URLRegexpRE)
markdownLinkRe = regexp.MustCompile(URLMarkdownLinkRE)
)
type ESPost struct {
Id string `json:"id"`
TeamId string `json:"team_id"`
ChannelId string `json:"channel_id"`
UserId string `json:"user_id"`
CreateAt int64 `json:"create_at"`
Message string `json:"message"`
Type string `json:"type"`
Hashtags []string `json:"hashtags"`
Attachments string `json:"attachments"`
URLs []string `json:"urls"`
ChannelType string `json:"channel_type"`
}
type ESFile struct {
Id string `json:"id"`
CreatorId string `json:"creator_id"`
ChannelId string `json:"channel_id"`
PostId string `json:"post_id"`
CreateAt int64 `json:"create_at"`
Content string `json:"content"`
Extension string `json:"extension"`
Name string `json:"name"`
}
type ESChannel struct {
Id string `json:"id"`
Type model.ChannelType `json:"type"`
DeleteAt int64 `json:"delete_at"`
UserIDs []string `json:"user_ids"`
TeamId string `json:"team_id"`
TeamMemberIDs []string `json:"team_member_ids"`
NameSuggest []string `json:"name_suggestions"`
}
type ESUser struct {
Id string `json:"id"`
SuggestionsWithFullname []string `json:"suggestions_with_fullname"`
SuggestionsWithoutFullname []string `json:"suggestions_without_fullname"`
DeleteAt int64 `json:"delete_at"`
Roles []string `json:"roles"`
TeamsIds []string `json:"team_id"`
ChannelsIds []string `json:"channel_id"`
}
func ESPostFromPost(post *model.Post, teamId string, channelType string) (*ESPost, error) {
p := &model.PostForIndexing{
TeamId: teamId,
ChannelType: channelType,
}
err := post.ShallowCopy(&p.Post)
if err != nil {
return nil, err
}
return ESPostFromPostForIndexing(p), nil
}
func ESPostFromPostForIndexing(post *model.PostForIndexing) *ESPost {
searchPost := ESPost{
Id: post.Id,
TeamId: post.TeamId,
ChannelId: post.ChannelId,
UserId: post.UserId,
CreateAt: post.CreateAt,
Message: post.Message,
Type: post.Type,
Hashtags: strings.Fields(post.Hashtags),
ChannelType: post.ChannelType,
}
var searchAttachments []string
if attachments := post.GetProp(model.PostPropsAttachments); attachments != nil {
attachmentsInterfaceArray, ok := attachments.([]any)
if ok {
for _, attachment := range attachmentsInterfaceArray {
if attachment == nil {
continue
}
m, mOk := attachment.(map[string]any)
if !mOk {
continue
}
for _, key := range []string{"text", "title", "pretext", "fallback", "footer", "author_name"} {
if s, _ := m[key].(string); s != "" {
searchAttachments = append(searchAttachments, s)
}
}
if fields, fOk := m["fields"].([]any); fOk {
for _, f := range fields {
if fm, fmOk := f.(map[string]any); fmOk {
if s, _ := fm["title"].(string); s != "" {
searchAttachments = append(searchAttachments, s)
}
if v := fm["value"]; v != nil {
if s, vOk := v.(string); vOk {
if s != "" {
searchAttachments = append(searchAttachments, s)
}
} else {
searchAttachments = append(searchAttachments, fmt.Sprint(v))
}
}
}
}
}
}
}
attachmentsArray, ok := attachments.([]*model.MessageAttachment)
if ok {
for _, attachment := range attachmentsArray {
if attachment == nil {
continue
}
for _, s := range []string{attachment.Text, attachment.Title, attachment.Pretext, attachment.Fallback, attachment.Footer, attachment.AuthorName} {
if s != "" {
searchAttachments = append(searchAttachments, s)
}
}
for _, field := range attachment.Fields {
if field == nil {
continue
}
if field.Title != "" {
searchAttachments = append(searchAttachments, field.Title)
}
if field.Value != nil {
if s, ok := field.Value.(string); ok {
if s != "" {
searchAttachments = append(searchAttachments, s)
}
} else {
searchAttachments = append(searchAttachments, fmt.Sprint(field.Value))
}
}
}
}
}
}
searchPost.Attachments = strings.Join(searchAttachments, " ")
urls := extractURLsFromMessage(post.Message)
if len(urls) > 0 {
searchPost.URLs = urls
}
if searchPost.Type == "" {
searchPost.Type = "default"
}
return &searchPost
}
func extractURLsFromMessage(message string) []string {
message = markdownLinkRe.ReplaceAllString(message, "")
urls := urlRe.FindAllString(message, -1)
filteredURLs := make([]string, 0)
for _, u := range urls {
u = strings.TrimSpace(u)
urlToCheck := u
if !strings.HasPrefix(u, "http://") && !strings.HasPrefix(u, "https://") {
urlToCheck = "http://" + u
}
parsedURL, err := url.Parse(urlToCheck)
if err != nil || parsedURL.Scheme == "" || parsedURL.Host == "" {
continue
}
filteredURLs = append(filteredURLs, u)
}
return filteredURLs
}
func splitFilenameWords(name string) string {
result := name
result = strings.ReplaceAll(result, "-", " ")
result = strings.ReplaceAll(result, ".", " ")
return result
}
func ESFileFromFileInfo(file *model.FileInfo, channelId string) *ESFile {
return &ESFile{
Id: file.Id,
CreatorId: file.CreatorId,
ChannelId: channelId,
PostId: file.PostId,
CreateAt: file.CreateAt,
Content: file.Content,
Extension: file.Extension,
Name: file.Name + " " + splitFilenameWords(file.Name),
}
}
func ESFileFromFileForIndexing(file *model.FileForIndexing) *ESFile {
return &ESFile{
Id: file.Id,
CreatorId: file.CreatorId,
ChannelId: file.ChannelId,
PostId: file.PostId,
CreateAt: file.CreateAt,
Content: file.Content,
Extension: file.Extension,
Name: file.Name + " " + splitFilenameWords(file.Name),
}
}
func ESChannelFromChannel(channel *model.Channel, userIDs, teamMemberIDs []string) *ESChannel {
displayNameInputs := searchengine.GetSuggestionInputsSplitBy(channel.DisplayName, " ")
nameInputs := searchengine.GetSuggestionInputsSplitByMultiple(channel.Name, []string{"-", "_"})
return &ESChannel{
Id: channel.Id,
Type: channel.Type,
DeleteAt: channel.DeleteAt,
UserIDs: userIDs,
TeamId: channel.TeamId,
TeamMemberIDs: teamMemberIDs,
NameSuggest: append(displayNameInputs, nameInputs...),
}
}
func ESUserFromUserAndTeams(user *model.User, teamsIds, channelsIds []string) *ESUser {
usernameSuggestions := searchengine.GetSuggestionInputsSplitByMultiple(user.Username, []string{".", "-", "_"})
fullnameStrings := []string{}
if user.FirstName != "" {
fullnameStrings = append(fullnameStrings, user.FirstName)
}
if user.LastName != "" {
fullnameStrings = append(fullnameStrings, user.LastName)
}
fullnameSuggestions := []string{}
if len(fullnameStrings) > 0 {
fullname := strings.Join(fullnameStrings, " ")
fullnameSuggestions = searchengine.GetSuggestionInputsSplitBy(fullname, " ")
}
nicknameSuggestions := []string{}
if user.Nickname != "" {
nicknameSuggestions = searchengine.GetSuggestionInputsSplitBy(user.Nickname, " ")
}
usernameAndNicknameSuggestions := append(usernameSuggestions, nicknameSuggestions...)
return &ESUser{
Id: user.Id,
SuggestionsWithFullname: append(usernameAndNicknameSuggestions, fullnameSuggestions...),
SuggestionsWithoutFullname: usernameAndNicknameSuggestions,
DeleteAt: user.DeleteAt,
Roles: user.GetRoles(),
TeamsIds: teamsIds,
ChannelsIds: channelsIds,
}
}
func ESUserFromUserForIndexing(userForIndexing *model.UserForIndexing) *ESUser {
user := &model.User{
Id: userForIndexing.Id,
Username: userForIndexing.Username,
Nickname: userForIndexing.Nickname,
FirstName: userForIndexing.FirstName,
Roles: userForIndexing.Roles,
LastName: userForIndexing.LastName,
CreateAt: userForIndexing.CreateAt,
DeleteAt: userForIndexing.DeleteAt,
}
return ESUserFromUserAndTeams(user, userForIndexing.TeamsIds, userForIndexing.ChannelsIds)
}
// SearchIndexName returns the index pattern to search for a given index name.
func SearchIndexName(settings model.ElasticsearchSettings, name string) string {
if *settings.GlobalSearchPrefix == "" {
return *settings.IndexPrefix + name
}
// GlobalSearchPrefix is a prefix of IndexPrefix itself. This is verified in the config.
// Therefore, we use * to search across all indices with the common search prefix.
return *settings.GlobalSearchPrefix + "*" + name
}
func BuildPostIndexName(aggregateAfterDays int, unaggregatedBase string, aggregatedBase string, now time.Time, createAt int64) string {
postTime := time.Unix(createAt/1000, 0)
aggregateCutoffTime := time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, time.Local).AddDate(0, 0, -aggregateAfterDays+1)
if postTime.Before(aggregateCutoffTime) {
return fmt.Sprintf("%v_%d_%02d", aggregatedBase, postTime.Year(), postTime.Month())
}
return fmt.Sprintf("%v_%d_%02d_%02d", unaggregatedBase, postTime.Year(), postTime.Month(), postTime.Day())
}
func NumIndexWorkers() int {
const maxCPU = 4
if runtime.NumCPU() > maxCPU {
return maxCPU
}
return runtime.NumCPU()
}
// maxCertFileSizeBytes is an internal constant
// used to limit file size of ClientCert, ClientKey and CA.
const maxCertFileSizeBytes = 1_000_000 // 1MB
func ReadFileSafely(fb filestore.FileBackend, path string) ([]byte, error) {
rd, err := fb.Reader(path)
if err != nil {
return nil, err
}
defer rd.Close()
type resp struct {
buf []byte
err error
}
ch := make(chan resp)
go func() {
buf, err := io.ReadAll(io.LimitReader(rd, maxCertFileSizeBytes))
ch <- resp{buf, err}
}()
select {
case got := <-ch:
return got.buf, got.err
case <-time.After(10 * time.Second): // Adding a timeout for the file read.
return nil, fmt.Errorf("timed out while reading file: %s", path)
}
}
func GetMatchesForHit(highlights map[string][]string) ([]string, error) {
matchMap := make(map[string]bool)
parseMatches := func(snippets []string) error {
// Highlighted matches are returned as an array of snippets of the post where
// each snippet has the highlighted text surrounded by html <em> tags
for _, snippet := range snippets {
decoder := xml.NewDecoder(strings.NewReader(snippet))
inMatch := false
for {
token, err := decoder.Token()
if err == io.EOF {
break
} else if err != nil {
return err
}
switch typed := token.(type) {
case xml.StartElement:
if typed.Name.Local == "em" {
inMatch = true
}
case xml.EndElement:
if typed.Name.Local == "em" {
inMatch = false
}
case xml.CharData:
if inMatch && len(typed) != 0 {
match := string(typed)
match = strings.Trim(match, "_*~")
matchMap[match] = true
}
}
}
}
return nil
}
if err := parseMatches(highlights["message"]); err != nil {
return nil, err
}
if err := parseMatches(highlights["attachments"]); err != nil {
return nil, err
}
if err := parseMatches(highlights["urls"]); err != nil {
return nil, err
}
if err := parseMatches(highlights["hashtags"]); err != nil {
return nil, err
}
var matches []string
for match := range matchMap {
matches = append(matches, match)
}
return matches, nil
}