Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

br: cli refactor backup error handling logic #54697

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions br/pkg/backup/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -1235,9 +1235,9 @@ func (bc *Client) OnBackupResponse(
return txnlock.NewLock(lockErr), nil
}
}
res := errContext.HandleIgnorableError(errPb, storeID)
res := utils.HandleBackupError(errPb, storeID, errContext)
switch res.Strategy {
case utils.GiveUpStrategy:
case utils.StrategyGiveUp:
errMsg := res.Reason
if len(errMsg) <= 0 {
errMsg = errPb.Msg
Expand Down
3 changes: 2 additions & 1 deletion br/pkg/utils/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,10 @@ go_library(
"db.go",
"dyn_pprof_other.go",
"dyn_pprof_unix.go",
"error_handling.go",
"json.go",
"key.go",
"misc.go",
"permission.go",
"pointer.go",
"pprof.go",
"progress.go",
Expand Down Expand Up @@ -68,6 +68,7 @@ go_test(
srcs = [
"backoff_test.go",
"db_test.go",
"error_handling_test.go",
"json_test.go",
"key_test.go",
"main_test.go",
Expand Down
4 changes: 2 additions & 2 deletions br/pkg/utils/backoff.go
Original file line number Diff line number Diff line change
Expand Up @@ -172,8 +172,8 @@ func (bo *importerBackoffer) NextBackoff(err error) time.Duration {
// we don't care storeID here.
errs := multierr.Errors(err)
lastErr := errs[len(errs)-1]
res := bo.errContext.HandleErrorMsg(lastErr.Error(), 0)
if res.Strategy == RetryStrategy {
res := HandleUnknownBackupError(lastErr.Error(), 0, bo.errContext)
if res.Strategy == StrategyRetry {
bo.delayTime = 2 * bo.delayTime
bo.attempt--
} else {
Expand Down
196 changes: 196 additions & 0 deletions br/pkg/utils/error_handling.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
// Copyright 2024 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http:https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package utils
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

new file should add license first, you can copy from other files.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah! good catch


import (
"fmt"
"strings"
"sync"

backuppb "github.com/pingcap/kvproto/pkg/brpb"
"github.com/pingcap/log"
"go.uber.org/zap"
)

// UNSAFE! TODO: remove and map them to error types
var retryableErrorMsg = []string{
"server closed",
"connection refused",
"connection reset by peer",
"channel closed",
"error trying to connect",
"connection closed before message completed",
"body write aborted",
"error during dispatch",
"put object timeout",
"timeout after",
"internalerror",
"not read from or written to within the timeout period",
"<code>requesttimeout</code>",
"<code>invalidpart</code>",
"end of file before message length reached",
}

// non-retryable error messages
// UNSAFE! TODO: remove and map them to error types
const (
ioMsg = "io"
notFoundMsg = "notfound"
permissionDeniedMsg = "permissiondenied"
)

// error messages
const (
unreachableRetryMsg = "unreachable retry"
retryOnKvErrorMsg = "retry on kv error"
retryOnRegionErrorMsg = "retry on region error"
clusterIdMismatchMsg = "cluster id mismatch"
unknownErrorMsg = "unknown error"
contextCancelledMsg = "context canceled"
retryOnUnknownErrorMsg = "unknown error, retry it for a few times"
noRetryOnUnknownErrorMsg = "unknown error, retried too many times, give up"
retryableStorageErrorMsg = "retryable storage error"
)

type ErrorHandlingResult struct {
Strategy ErrorHandlingStrategy
Reason string
}

type ErrorHandlingStrategy int

const (
// StrategyRetry error can be retried but will consume the backoff attempt quota.
StrategyRetry ErrorHandlingStrategy = iota
// StrategyGiveUp means unrecoverable error happened and the BR should exit
// for example:
// 1. permission not valid.
// 2. data not found.
// 3. retry too many times
StrategyGiveUp
// StrategyUnknown for StrategyUnknown error
StrategyUnknown
)

type ErrorContext struct {
mu sync.Mutex
// encounter times for one context on a store
// we may use this value to determine the retry policy
encounterTimes map[uint64]int
// unknown error retry limitation.
// encounter many times error makes Retry to GiveUp.
encounterTimesLimitation int
description string
}

func NewErrorContext(scenario string, limitation int) *ErrorContext {
return &ErrorContext{
description: scenario,
encounterTimes: make(map[uint64]int),
encounterTimesLimitation: limitation,
}
}

func NewDefaultContext() *ErrorContext {
return &ErrorContext{
description: "default",
encounterTimes: make(map[uint64]int),
encounterTimesLimitation: 1,
}
}

func HandleBackupError(err *backuppb.Error, storeId uint64, ec *ErrorContext) ErrorHandlingResult {
if err == nil {
return ErrorHandlingResult{StrategyRetry, unreachableRetryMsg}
}
res := handleBackupProtoError(err)
// try the best effort handle unknown error based on their error message
if res.Strategy == StrategyUnknown && len(err.Msg) != 0 {
return HandleUnknownBackupError(err.Msg, storeId, ec)
}
return res
}

func handleBackupProtoError(e *backuppb.Error) ErrorHandlingResult {
switch e.Detail.(type) {
case *backuppb.Error_KvError:
return ErrorHandlingResult{StrategyRetry, retryOnKvErrorMsg}
case *backuppb.Error_RegionError:
return ErrorHandlingResult{StrategyRetry, retryOnRegionErrorMsg}
case *backuppb.Error_ClusterIdError:
return ErrorHandlingResult{StrategyGiveUp, clusterIdMismatchMsg}
}
return ErrorHandlingResult{StrategyUnknown, unknownErrorMsg}
}

// HandleUnknownBackupError UNSAFE! TODO: remove this method and map all the current unknown errors to error types
func HandleUnknownBackupError(msg string, uuid uint64, ec *ErrorContext) ErrorHandlingResult {
// UNSAFE! TODO: use meaningful error code instead of unstructured message to find failed to write error.
logger := log.L().With(zap.String("description", ec.description))
if messageIsNotFoundStorageError(msg) {
reason := fmt.Sprintf("File or directory not found on TiKV Node (store id: %v). "+
"workaround: please ensure br and tikv nodes share a same storage and the user of br and tikv has same uid.",
uuid)
return ErrorHandlingResult{StrategyGiveUp, reason}
}
if messageIsPermissionDeniedStorageError(msg) {
reason := fmt.Sprintf("I/O permission denied error occurs on TiKV Node(store id: %v). "+
"workaround: please ensure tikv has permission to read from & write to the storage.",
uuid)
return ErrorHandlingResult{StrategyGiveUp, reason}
}
msgLower := strings.ToLower(msg)
if strings.Contains(msgLower, contextCancelledMsg) {
return ErrorHandlingResult{StrategyGiveUp, contextCancelledMsg}
}

if MessageIsRetryableStorageError(msg) {
logger.Warn(retryableStorageErrorMsg, zap.String("error", msg))
return ErrorHandlingResult{StrategyRetry, retryableStorageErrorMsg}
}

// retry enough on same store
ec.mu.Lock()
defer ec.mu.Unlock()
ec.encounterTimes[uuid]++
if ec.encounterTimes[uuid] <= ec.encounterTimesLimitation {
return ErrorHandlingResult{StrategyRetry, retryOnUnknownErrorMsg}
}
return ErrorHandlingResult{StrategyGiveUp, noRetryOnUnknownErrorMsg}
}

// messageIsNotFoundStorageError checks whether the message returning from TiKV is "NotFound" storage I/O error
func messageIsNotFoundStorageError(msg string) bool {
Copy link
Contributor Author

@Tristan1900 Tristan1900 Jul 17, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@3pointer does the storage here mean S3 or any backup destination? At first I thought it's TiKV internal but there is a proto type for it already so got a bit confused. Thanks

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

at this context storage means TiKV.

msgLower := strings.ToLower(msg)
return strings.Contains(msgLower, ioMsg) && strings.Contains(msgLower, notFoundMsg)
}

// MessageIsPermissionDeniedStorageError checks whether the message returning from TiKV is "PermissionDenied" storage I/O error
func messageIsPermissionDeniedStorageError(msg string) bool {
msgLower := strings.ToLower(msg)
return strings.Contains(msgLower, permissionDeniedMsg)
}

// MessageIsRetryableStorageError checks whether the message returning from TiKV is retryable ExternalStorageError.
func MessageIsRetryableStorageError(msg string) bool {
msgLower := strings.ToLower(msg)
// UNSAFE! TODO: Add a error type for retryable connection error.
for _, errStr := range retryableErrorMsg {
if strings.Contains(msgLower, errStr) {
return true
}
}
return false
}
70 changes: 70 additions & 0 deletions br/pkg/utils/error_handling_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package utils

import (
"testing"

backuppb "github.com/pingcap/kvproto/pkg/brpb"
"github.com/pingcap/kvproto/pkg/errorpb"
"github.com/stretchr/testify/require"
)

func TestHandleError(t *testing.T) {
ec := NewErrorContext("test", 3)
// Test case 1: Error is nil
result := HandleBackupError(nil, 123, ec)
require.Equal(t, ErrorHandlingResult{Strategy: StrategyRetry, Reason: unreachableRetryMsg}, result)

// Test case 2: Error is KvError and can be ignored
kvError := &backuppb.Error_KvError{}
result = HandleBackupError(&backuppb.Error{Detail: kvError}, 123, ec)
require.Equal(t, ErrorHandlingResult{Strategy: StrategyRetry, Reason: retryOnKvErrorMsg}, result)

// Test case 3: Error is RegionError and can be ignored
regionError := &backuppb.Error_RegionError{
RegionError: &errorpb.Error{NotLeader: &errorpb.NotLeader{RegionId: 1}}}
result = HandleBackupError(&backuppb.Error{Detail: regionError}, 123, ec)
require.Equal(t, ErrorHandlingResult{Strategy: StrategyRetry, Reason: retryOnRegionErrorMsg}, result)

// Test case 4: Error is ClusterIdError
clusterIdError := &backuppb.Error_ClusterIdError{}
result = HandleBackupError(&backuppb.Error{Detail: clusterIdError}, 123, ec)
require.Equal(t, ErrorHandlingResult{Strategy: StrategyGiveUp, Reason: clusterIdMismatchMsg}, result)
}

func TestHandleErrorMsg(t *testing.T) {
ec := NewErrorContext("test", 3)

// Test messageIsNotFoundStorageError
msg := "IO: files Notfound error"
uuid := uint64(456)
expectedReason := "File or directory not found on TiKV Node (store id: 456). workaround: please ensure br and tikv nodes share a same storage and the user of br and tikv has same uid."
expectedResult := ErrorHandlingResult{Strategy: StrategyGiveUp, Reason: expectedReason}
actualResult := HandleUnknownBackupError(msg, uuid, ec)
require.Equal(t, expectedResult, actualResult)

// Test messageIsPermissionDeniedStorageError
msg = "I/O permissiondenied error occurs on TiKV Node(store id: 456)."
expectedReason = "I/O permission denied error occurs on TiKV Node(store id: 456). workaround: please ensure tikv has permission to read from & write to the storage."
expectedResult = ErrorHandlingResult{Strategy: StrategyGiveUp, Reason: expectedReason}
actualResult = HandleUnknownBackupError(msg, uuid, ec)
require.Equal(t, expectedResult, actualResult)

// Test MessageIsRetryableStorageError
msg = "server closed"
expectedResult = ErrorHandlingResult{Strategy: StrategyRetry, Reason: retryableStorageErrorMsg}
actualResult = HandleUnknownBackupError(msg, uuid, ec)
require.Equal(t, expectedResult, actualResult)

// Test unknown error
msg = "unknown error"
expectedResult = ErrorHandlingResult{Strategy: StrategyRetry, Reason: retryOnUnknownErrorMsg}
actualResult = HandleUnknownBackupError(msg, uuid, ec)
require.Equal(t, expectedResult, actualResult)

// Test retry too many times
_ = HandleUnknownBackupError(msg, uuid, ec)
_ = HandleUnknownBackupError(msg, uuid, ec)
expectedResult = ErrorHandlingResult{Strategy: StrategyGiveUp, Reason: noRetryOnUnknownErrorMsg}
actualResult = HandleUnknownBackupError(msg, uuid, ec)
require.Equal(t, expectedResult, actualResult)
}
20 changes: 0 additions & 20 deletions br/pkg/utils/permission.go

This file was deleted.

Loading
Loading