// Copyright (c) The OpenTofu Authors
// SPDX-License-Identifier: MPL-2.0
// Copyright (c) 2023 HashiCorp, Inc.
// SPDX-License-Identifier: MPL-2.0

// ---------------------------------------------------------------------------
// internal/command/format/filter_control.go
// ---------------------------------------------------------------------------

const (
	// unicodeControlPicturesStart is the codepoint of the first character in
	// the Unicode "Control Pictures" block.
	//
	// The first 32 codepoints in this block correlate with the control
	// characters in the first 32 codepoints of the "Basic Latin" block, so a
	// control character codepoint can be translated into its corresponding
	// control picture codepoint by adding this constant.
	unicodeControlPicturesStart = rune(0x2400)

	// del is the DEL character. It sits outside the 0x00-0x1f range, so it
	// cannot be translated by simple addition and is special-cased instead.
	del = rune(0x7f)

	// delPicture is the Control Pictures codepoint used to represent del.
	delPicture = rune(0x2421)
)

// FilterControlChars translates 7-bit control characters in the given string
// (character codes less than 32, plus DEL at 0x7f) into their corresponding
// symbols from the Unicode "Control Pictures" block, so that the result can be
// printed to a terminal-like device without affecting the terminal's state
// machine.
//
// As an exception this does not change control characters that commonly appear
// as part of human-oriented text: newline (0x0a), carriage return (0x0d),
// and horizontal tab (0x09).
//
// Every byte that is not one of the filtered control characters is copied to
// the result unchanged. That includes multi-byte UTF-8 sequences and any bytes
// that are not valid UTF-8, so the treatment of such bytes does not depend on
// whether the input also happens to contain a control character.
//
// We use this when including untrusted data as part of "human-friendly"
// output. We use the Unicode control pictures so that a human reader can
// (with a suitably-equipped terminal font) still identify which specific
// control character appeared, in case that is helpful for debugging, and
// because they are relatively unlikely to appear literally in a string we're
// rendering in the UI.
//
// This is only for arbitrary text strings rendered directly in the UI,
// such as the message portions of rendered diagnostics. We need not use this
// when producing machine-readable output such as JSON representations, or when
// showing a string in a quoted notation that mimics either the HCL or Go string
// syntax, because the control characters are already backslash-escaped by the
// quoting process in those cases. We also don't need to use this for strings
// that are known to contain valid HCL identifiers, because the control
// characters are not valid for use in HCL's identifier tokens.
//
// NOTE(review): C1 control characters (U+0080 through U+009F) are not
// translated here; confirm whether any supported terminal interprets them.
func FilterControlChars(input string) string {
	// In the common case there are no relevant control characters at all, so
	// we first locate the earliest one; if there is none we return the input
	// verbatim and thus avoid allocating a new copy of that string.
	first := strings.IndexFunc(input, isFilteredControlChar)
	if first < 0 {
		return input
	}

	// If we get here then we definitely need to build a new string.
	var buf strings.Builder
	// We'll give ourselves capacity for replacing up to two control characters
	// with their "Control Pictures" equivalents. Each 1-byte control character
	// becomes a 3-byte UTF-8 sequence, which is two extra bytes apiece. If we
	// find more than two control characters then the buffer may reallocate
	// (automatically) to get extra capacity.
	buf.Grow(len(input) + 4)

	// Everything before the first control character is already known to be
	// clean, so it can be copied in one step without being re-examined.
	// Writing to a [strings.Builder] never encounters an error.
	_, _ = buf.WriteString(input[:first])

	// Every filtered character is a single ASCII byte, and bytes below 0x80
	// never occur inside a multi-byte UTF-8 sequence, so scanning byte-by-byte
	// finds exactly the same characters as decoding runes would while letting
	// all other bytes pass through untouched.
	for i := first; i < len(input); i++ {
		b := input[i]
		if r := rune(b); isFilteredControlChar(r) {
			// r is a control character that must not reach the terminal, so
			// we emit the 3-byte encoding of its Control Picture instead.
			_, _ = buf.WriteRune(controlPicture(r))
			continue
		}
		_ = buf.WriteByte(b)
	}
	return buf.String()
}

// isFilteredControlChar returns true if and only if the given rune is a
// control character that [FilterControlChars] must replace: any C0 control
// character (0x00 through 0x1f) other than horizontal tab, newline, and
// carriage return, or the DEL character (0x7f).
func isFilteredControlChar(r rune) bool {
	switch r {
	case '\t', '\n', '\r':
		// Common in human-oriented text, so deliberately left alone.
		return false
	case del:
		return true
	}
	// Space (0x20) is the first non-control character. The lower bound keeps
	// negative (invalid) rune values from being classified as controls.
	return r >= 0 && r < ' '
}

// controlPicture returns the control picture equivalent of the given C0 control
// character or DEL, or returns the given character verbatim if it is neither.
//
// Unlike [isFilteredControlChar], this makes no exception for tab, newline, or
// carriage return: callers decide which characters are worth translating.
func controlPicture(ctrl rune) rune {
	switch {
	case ctrl >= 0 && ctrl < ' ':
		return unicodeControlPicturesStart + ctrl
	case ctrl == del:
		return delPicture
	default:
		return ctrl
	}
}

// ---------------------------------------------------------------------------
// internal/command/format/filter_control_test.go
// ---------------------------------------------------------------------------

// TestFilterControlChars checks that FilterControlChars replaces exactly the
// filtered control characters and leaves every other byte of the input alone.
func TestFilterControlChars(t *testing.T) {
	tests := []struct {
		input string
		want  string
	}{
		{"", ""},
		{"Hello, world!", "Hello, world!"},
		{"Hello\nworld!", "Hello\nworld!"},
		{"Hello\rworld!", "Hello\rworld!"},
		{"Hello\r\nworld!", "Hello\r\nworld!"},
		{"Hello\tworld!", "Hello\tworld!"},
		{"Hello world\x00", "Hello world␀"},

		// Filter various ways that someone might try to hide or replace earlier
		// output from OpenTofu.
		{"Hello\x7f\x7f\x7f\x7f\x7fGoodbye, world!", "Hello␡␡␡␡␡Goodbye, world!"},
		{"Hello\x08\x08\x08\x08\x08Goodbye, world!", "Hello␈␈␈␈␈Goodbye, world!"},
		// "Set Graphic Rendition" (SGR) control sequence
		{"\x1b[1m", "␛[1m"},
		// "Reverse Index" (RI) control sequence (moves cursor up, so
		// subsequent text could overwrite earlier text)
		{"\x1bM", "␛M"},

		// Multi-byte characters, including a literal control picture, pass
		// through unchanged even when a rebuild of the string is required.
		{"héllo ␀\x07", "héllo ␀␇"},

		// Bytes that are not valid UTF-8 are preserved as-is whether or not
		// the string also contains a control character.
		{"Hello\xffworld", "Hello\xffworld"},
		{"Hello\xffworld\x00", "Hello\xffworld␀"},

		// The cases above ensure that we handle some relatively-likely
		// combinations in a sensible way, but we'll also just exhaustively
		// test all of them together to make sure they all get handled in
		// a reasonable way.
		{
			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f \x7f",
			"␀␁␂␃␄␅␆␇␈\t\n␋␌\r␎␏␐␑␒␓␔␕␖␗␘␙␚␛␜␝␞␟ ␡",
		},
	}

	for _, test := range tests {
		t.Run(fmt.Sprintf("%q", test.input), func(t *testing.T) {
			got := FilterControlChars(test.input)
			if got != test.want {
				t.Errorf("wrong result\ninput: %q\ngot:   %q\nwant:  %q", test.input, got, test.want)
			}
		})
	}

	// Values that are not C0 control characters or DEL must be returned
	// verbatim by controlPicture and must never be classified as filtered.
	for _, r := range []rune{-1, ' ', 'A', 0x80, 0x2400} {
		if got := controlPicture(r); got != r {
			t.Errorf("controlPicture(%#x) = %#x; want it unchanged", r, got)
		}
		if isFilteredControlChar(r) {
			t.Errorf("isFilteredControlChar(%#x) = true; want false", r)
		}
	}
}