mirror of
https://github.com/mattermost/mattermost.git
synced 2026-05-28 04:35:04 -04:00
* Add health flag to fast-fail when ES is offline
When Elasticsearch goes offline, the watcher takes up to 3 health-check
cycles (~180 s) to detect the outage and stop the engine.
During that window every search query blocks for 30 s before
falling back to the database, and indexing goroutines pile up
unboundedly — causing server-wide slowness, posting failures,
and duplicate posts from client retries (MM-66612).
Introduce a `healthy` atomic flag on each ES/OpenSearch engine.
The watcher sets it to false on the *first* health-check failure
and back to true on success. `Broker.GetActiveEngines()` now
requires both `IsActive()` and `IsHealthy()`, so all search and
indexing operations skip the unhealthy engine immediately. The
existing 3-failure stop/restart cycle is unchanged and continues
to handle full recovery.
* Fix other tests
* Fix unrelated flaky test
* Use atomic.int32 everywhere
* Revert "Fix unrelated flaky test"
This reverts commit a289015637.
* Improve coverage for ActiveEngine/GetActiveEngines
* Document expectations on SearchEngineInterface
* Use mock.On("call").Unset
* Remove healthCalls to avoid a flaky test
* Be explicit on Unset
* Log any change in the health of the search engine
* Test the healthy<->unhealthy changes are logged
---------
Co-authored-by: Mattermost Build <build@mattermost.com>
75 lines
4.1 KiB
Go
75 lines
4.1 KiB
Go
// Copyright (c) 2015-present Mattermost, Inc. All Rights Reserved.
|
|
// See LICENSE.txt for license information.
|
|
|
|
package searchengine
|
|
|
|
import (
|
|
"context"
|
|
"time"
|
|
|
|
"github.com/mattermost/mattermost/server/public/model"
|
|
"github.com/mattermost/mattermost/server/public/shared/request"
|
|
)
|
|
|
|
type SearchEngineInterface interface {
|
|
// Start initializes the engine connection. Implementations must set the
|
|
// initial health state (e.g. healthy) before returning, because the broker
|
|
// may call IsHealthy() immediately after Start() returns.
|
|
Start(ctx context.Context) *model.AppError
|
|
// Stop tears down the engine connection. Implementations must clear the
|
|
// health flag (i.e. set unhealthy) during Stop so that the broker does not
|
|
// route queries to a stopped engine.
|
|
Stop() *model.AppError
|
|
HealthCheck(rctx request.CTX) *model.AppError
|
|
GetFullVersion() string
|
|
GetVersion() int
|
|
GetPlugins() []string
|
|
UpdateConfig(cfg *model.Config)
|
|
GetName() string
|
|
// IsEnabled returns a boolean indicating whether the engine is enabled in the settings
|
|
IsEnabled() bool
|
|
IsActive() bool
|
|
// IsHealthy reports whether the engine is reachable. The initial value is
|
|
// set by the engine itself during Start() and cleared during Stop(). After
|
|
// startup, only the watcher drives transitions by calling SetHealthy(false)
|
|
// on the first health-check failure and SetHealthy(true) on success,
|
|
// allowing the broker to skip unhealthy engines immediately instead of
|
|
// waiting for full stop/restart.
|
|
IsHealthy() bool
|
|
// SetHealthy is called by the watcher to update the engine's health status
|
|
// based on health-check results. Implementations should not call this
|
|
// themselves — Start() and Stop() set the initial/final state directly.
|
|
SetHealthy(healthy bool)
|
|
IsIndexingEnabled() bool
|
|
IsSearchEnabled() bool
|
|
IsAutocompletionEnabled() bool
|
|
IsIndexingSync() bool
|
|
IndexPost(post *model.Post, teamId string, channelType string) *model.AppError
|
|
SearchPosts(channels model.ChannelList, searchParams []*model.SearchParams, page, perPage int) ([]string, model.PostSearchMatches, *model.AppError)
|
|
DeletePost(post *model.Post) *model.AppError
|
|
DeleteChannelPosts(rctx request.CTX, channelID string) *model.AppError
|
|
UpdatePostsChannelTypeByChannelId(rctx request.CTX, channelID string, channelType string) *model.AppError
|
|
BackfillPostsChannelType(rctx request.CTX, channelIDs []string, channelType string) *model.AppError
|
|
DeleteUserPosts(rctx request.CTX, userID string) *model.AppError
|
|
// IndexChannel indexes a given channel. The userIDs are only populated
|
|
// for private channels.
|
|
IndexChannel(rctx request.CTX, channel *model.Channel, userIDs, teamMemberIDs []string) *model.AppError
|
|
SyncBulkIndexChannels(rctx request.CTX, channels []*model.Channel, getUserIDsForChannel func(channel *model.Channel) ([]string, error), teamMemberIDs []string) *model.AppError
|
|
SearchChannels(teamId, userID, term string, isGuest, includeDeleted bool) ([]string, *model.AppError)
|
|
DeleteChannel(channel *model.Channel) *model.AppError
|
|
IndexUser(rctx request.CTX, user *model.User, teamsIds, channelsIds []string) *model.AppError
|
|
SearchUsersInChannel(teamId, channelId string, restrictedToChannels []string, term string, options *model.UserSearchOptions) ([]string, []string, *model.AppError)
|
|
SearchUsersInTeam(teamId string, restrictedToChannels []string, term string, options *model.UserSearchOptions) ([]string, *model.AppError)
|
|
DeleteUser(user *model.User) *model.AppError
|
|
IndexFile(file *model.FileInfo, channelId string) *model.AppError
|
|
SearchFiles(channels model.ChannelList, searchParams []*model.SearchParams, page, perPage int) ([]string, *model.AppError)
|
|
DeleteFile(fileID string) *model.AppError
|
|
DeletePostFiles(rctx request.CTX, postID string) *model.AppError
|
|
DeleteUserFiles(rctx request.CTX, userID string) *model.AppError
|
|
DeleteFilesBatch(rctx request.CTX, endTime, limit int64) *model.AppError
|
|
TestConfig(rctx request.CTX, cfg *model.Config) *model.AppError
|
|
PurgeIndexes(rctx request.CTX) *model.AppError
|
|
PurgeIndexList(rctx request.CTX, indexes []string) *model.AppError
|
|
RefreshIndexes(rctx request.CTX) *model.AppError
|
|
DataRetentionDeleteIndexes(rctx request.CTX, cutoff time.Time) *model.AppError
|
|
}
|