From 95be6eaf86cecb851f2c0eee0df98487dec5884c Mon Sep 17 00:00:00 2001 From: JG Heithcock Date: Sun, 26 Apr 2026 14:07:07 -0700 Subject: [PATCH] [MM-63434] Use forked PDF library with parsing depth limit (cherry-pick #35947) (#36177) * [MM-63434] Use forked PDF library with parsing depth limit (#35947) * [MM-63434] Use forked PDF library with parsing depth limit Replace github.com/ledongthuc/pdf with a fork that limits object nesting depth during parsing. Add test coverage. * Reverting incorrect merge that lost the change to msgpack The error was in merge https://github.com/mattermost/mattermost/commit/64bdff88d853e030d6e0f243cf21370a78dde6ae * Remove stale ledongthuc/pdf checksums after fork replace (go mod tidy) * Fix TestPdfFile expected text for forked PDF extractor (release-10.11) The jgheithcock/pdf fork returns a leading newline for sample-doc.pdf; align with master and 11.x cherry-picks. --- server/go.mod | 7 +++---- server/go.sum | 4 ++-- .../platform/services/docextractor/pdf_test.go | 17 ++++++++++++++++- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/server/go.mod b/server/go.mod index 3e318154dec..f413b74a22c 100644 --- a/server/go.mod +++ b/server/go.mod @@ -248,10 +248,6 @@ exclude ( github.com/willf/bitset v1.2.0 ) -// Prevent from being upgraded because this library has a minimum requirement -// of Go 1.24. -replace github.com/ledongthuc/pdf => github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06 - // Also prevent tablewriter from being upgraded because the downstream dependency // jaytaylor/html2text does not have a go.mod file which makes it bump to the latest // version always. Tablewriter has made breaking changes to its latest release. @@ -259,3 +255,6 @@ replace github.com/olekukonko/tablewriter => github.com/olekukonko/tablewriter v // See MM-66167, MM-68222 for more details. 
replace github.com/vmihailenco/msgpack/v5 => github.com/mattermost/msgpack/v5 v5.0.0-20260408165622-cadfad56a815 + +// Use a fork of ledongthuc/pdf that limits object nesting depth during parsing; see MM-63434 for more details. +replace github.com/ledongthuc/pdf => github.com/jgheithcock/pdf v0.0.0-20260404175814-28cd6530c1fe diff --git a/server/go.sum b/server/go.sum index 3530dc2c45f..46eb3073715 100644 --- a/server/go.sum +++ b/server/go.sum @@ -352,6 +352,8 @@ github.com/jaytaylor/html2text v0.0.0-20180606194806-57d518f124b0/go.mod h1:CVKl github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056 h1:iCHtR9CQyktQ5+f3dMVZfwD2KWJUgm7M0gdL9NGr8KA= github.com/jaytaylor/html2text v0.0.0-20230321000545-74c2419ad056/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk= github.com/jellevandenhooff/dkim v0.0.0-20150330215556-f50fe3d243e1/go.mod h1:E0B/fFc00Y+Rasa88328GlI/XbtyysCtTHZS8h7IrBU= +github.com/jgheithcock/pdf v0.0.0-20260404175814-28cd6530c1fe h1:9GAP+hdboArdSUwi82IXaNd+Qq8+cGFQh7xAcwZNN+s= +github.com/jgheithcock/pdf v0.0.0-20260404175814-28cd6530c1fe/go.mod h1:1fEHWurg7pvf5SG6XNE5Q8UZmOwex51Mkx3SLhrW5B4= github.com/jhump/protoreflect v1.15.1 h1:HUMERORf3I3ZdX05WaQ6MIpd/NJ434hTp5YiKgfCL6c= github.com/jhump/protoreflect v1.15.1/go.mod h1:jD/2GMKKE6OqX8qTjhADU1e6DShO+gavG9e0Q693nKo= github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg= @@ -403,8 +405,6 @@ github.com/lann/builder v0.0.0-20180802200727-47ae307949d0 h1:SOEGU9fKiNWd/HOJuq github.com/lann/builder v0.0.0-20180802200727-47ae307949d0/go.mod h1:dXGbAdH5GtBTC4WfIxhKZfyBF/HBFgRZSWwZ9g/He9o= github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0 h1:P6pPBnrTSX3DEVR4fDembhRWSsG5rVo6hYhAB/ADZrk= github.com/lann/ps v0.0.0-20150810152359-62de8c46ede0/go.mod h1:vmVJ0l/dxyfGW6FmdpVm2joNMFikkuWg0EoCKLGUMNw= -github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06 h1:kacRlPN7EN++tVpGUorNGPn/4DnB7/DfTY82AOn6ccU= -github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06/go.mod h1:imJHygn/1yfhB7XSJJKlFZKl/J+dCPAknuiaGOshXAs= 
github.com/levigross/exp-html v0.0.0-20120902181939-8df60c69a8f5 h1:W7p+m/AECTL3s/YR5RpQ4hz5SjNeKzZBl1q36ws12s0= github.com/levigross/exp-html v0.0.0-20120902181939-8df60c69a8f5/go.mod h1:QMe2wuKJ0o7zIVE8AqiT8rd8epmm6WDIZ2wyuBqYPzM= github.com/lib/pq v1.10.9 h1:YXG7RB+JIjhP29X+OtkiDnYaXQwpS4JEWq7dtCCRUEw= diff --git a/server/platform/services/docextractor/pdf_test.go b/server/platform/services/docextractor/pdf_test.go index c3a993d4c55..722cbf64896 100644 --- a/server/platform/services/docextractor/pdf_test.go +++ b/server/platform/services/docextractor/pdf_test.go @@ -20,7 +20,7 @@ func TestPdfEmptyFile(t *testing.T) { func TestPdfFile(t *testing.T) { extractor := pdfExtractor{} - contentText := "This is a simple document that contains some text." + contentText := "\nThis is a simple document that contains some text." content, err := testutils.ReadTestFile("sample-doc.pdf") require.NoError(t, err) extractedText, err := extractor.Extract("sample-doc.pdf", bytes.NewReader(content), 0) @@ -28,6 +28,21 @@ func TestPdfFile(t *testing.T) { require.Equal(t, contentText, extractedText) } +func TestPdfDeeplyNestedObjects(t *testing.T) { + // Test for MM-63434 + var buf bytes.Buffer + buf.WriteString("%PDF-1.0\n") + for range 10_000 { + buf.WriteString("0\n0\nobj\n") + } + buf.WriteString("startxref\n0\n%%EOF\n") + + extractor := pdfExtractor{} + text, err := extractor.Extract("excessive-nests.pdf", bytes.NewReader(buf.Bytes()), 0) + require.Error(t, err) + require.Empty(t, text) +} + func TestWrongPdfFile(t *testing.T) { extractor := pdfExtractor{} content, err := testutils.ReadTestFile("sample-doc.docx")