[MM-63434] Use forked PDF library with parsing depth limit (cherry-pick #35947) (#36177)

* [MM-63434] Use forked PDF library with parsing depth limit (#35947)

* [MM-63434] Use forked PDF library with parsing depth limit

Replace github.com/ledongthuc/pdf with a fork that limits object
nesting depth during parsing. Add test coverage.

* Reverting incorrect merge that lost the change to msgpack

The error was in merge 64bdff88d8

* Remove stale ledongthuc/pdf checksums after fork replace (go mod tidy)

* Fix TestPdfFile expected text for forked PDF extractor (release-10.11)

The jgheithcock/pdf fork returns a leading newline for sample-doc.pdf; align
with master and 11.x cherry-picks.
This commit is contained in:
JG Heithcock 2026-04-26 14:07:07 -07:00 committed by GitHub
parent 61d68d2d6e
commit 95be6eaf86
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 21 additions and 7 deletions

View file

@ -248,10 +248,6 @@ exclude (
github.com/willf/bitset v1.2.0
)
// Prevent from being upgraded because this library has a minimum requirement
// of Go 1.24.
replace github.com/ledongthuc/pdf => github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06
// Also prevent tablewriter from being upgraded because the downstream dependency
// jaytaylor/html2text does not have a go.mod file, which makes it always bump to the
// latest version. Tablewriter has made breaking changes in its latest release.
@ -259,3 +255,6 @@ replace github.com/olekukonko/tablewriter => github.com/olekukonko/tablewriter v
// See MM-66167, MM-68222 for more details.
replace github.com/vmihailenco/msgpack/v5 => github.com/mattermost/msgpack/v5 v5.0.0-20260408165622-cadfad56a815
// See MM-63434 for more details.
replace github.com/ledongthuc/pdf => github.com/jgheithcock/pdf v0.0.0-20260404175814-28cd6530c1fe

View file

@ -352,6 +352,8 @@ github.com/jaytaylor/html2text v0.0.0-20180606194806-57d518f124b0/go.mod h1:CVKl
github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 h1:iCHtR9CQyktQ5+f3dMVZfwD2KWJUgm7M0gdL9NGr8KA=
github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk=
github.com/jellevandenhooff/dkim v0.0.0-20150330215556-f50fe3d243e1/go.mod h1:E0B/fFc00Y+Rasa88328GlI/XbtyysCtTHZS8h7IrBU=
github.com/jgheithcock/pdf v0.0.0-20260404175814-28cd6530c1fe h1:9GAP+hdboArdSUwi82IXaNd+Qq8+cGFQh7xAcwZNN+s=
github.com/jgheithcock/pdf v0.0.0-20260404175814-28cd6530c1fe/go.mod h1:1fEHWurg7pvf5SG6XNE5Q8UZmOwex51Mkx3SLhrW5B4=
github.com/jhump/protoreflect v1.15.1 h1:HUMERORf3I3ZdX05WaQ6MIpd/NJ434hTp5YiKgfCL6c=
github.com/jhump/protoreflect v1.15.1/go.mod h1:jD/2GMKKE6OqX8qTjhADU1e6DShO+gavG9e0Q693nKo=
github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
@ -403,8 +405,6 @@ github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 h1:SOEGU9fKiNWd/HOJuq
github.com/lann/builder v0.0.0-20180802200727-47ae307949d0/go.mod h1:dXGbAdH5GtBTC4WfIxhKZfyBF/HBFgRZSWwZ9g/He9o=
github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 h1:P6pPBnrTSX3DEVR4fDembhRWSsG5rVo6hYhAB/ADZrk=
github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0/go.mod h1:vmVJ0l/dxyfGW6FmdpVm2joNMFikkuWg0EoCKLGUMNw=
github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06 h1:kacRlPN7EN++tVpGUorNGPn/4DnB7/DfTY82AOn6ccU=
github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs=
github.com/levigross/exp-html v0.0.0-20120902181939-8df60c69a8f5 h1:W7p+m/AECTL3s/YR5RpQ4hz5SjNeKzZBl1q36ws12s0=
github.com/levigross/exp-html v0.0.0-20120902181939-8df60c69a8f5/go.mod h1:QMe2wuKJ0o7zIVE8AqiT8rd8epmm6WDIZ2wyuBqYPzM=
github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw=

View file

@ -20,7 +20,7 @@ func TestPdfEmptyFile(t *testing.T) {
func TestPdfFile(t *testing.T) {
extractor := pdfExtractor{}
contentText := "This is a simple document that contains some text."
contentText := "\nThis is a simple document that contains some text."
content, err := testutils.ReadTestFile("sample-doc.pdf")
require.NoError(t, err)
extractedText, err := extractor.Extract("sample-doc.pdf", bytes.NewReader(content), 0)
@ -28,6 +28,21 @@ func TestPdfFile(t *testing.T) {
require.Equal(t, contentText, extractedText)
}
func TestPdfDeeplyNestedObjects(t *testing.T) {
// Test for MM-63434
var buf bytes.Buffer
buf.WriteString("%PDF-1.0\n")
for range 10_000 {
buf.WriteString("0\n0\nobj\n")
}
buf.WriteString("startxref\n0\n%%EOF\n")
extractor := pdfExtractor{}
text, err := extractor.Extract("excessive-nests.pdf", bytes.NewReader(buf.Bytes()), 0)
require.Error(t, err)
require.Empty(t, text)
}
func TestWrongPdfFile(t *testing.T) {
extractor := pdfExtractor{}
content, err := testutils.ReadTestFile("sample-doc.docx")