[MM-63434] Use forked PDF library with parsing depth limit

Replace github.com/ledongthuc/pdf with a fork that limits object
nesting depth during parsing. Add test coverage.
This commit is contained in:
JG Heithcock 2026-04-04 11:18:31 -07:00
parent 24e38f2bd7
commit 695e71c531
No known key found for this signature in database
GPG key ID: AE62550810974334
3 changed files with 20 additions and 0 deletions

View file

@ -230,3 +230,6 @@ require (
// See MM-66167 for more details.
replace github.com/vmihailenco/msgpack/v5 => github.com/mattermost/msgpack/v5 v5.0.0-20260120151306-2f9c67d7e57f
// See MM-63434 for more details.
replace github.com/ledongthuc/pdf => github.com/jgheithcock/pdf v0.0.0-20260404175814-28cd6530c1fe

View file

@ -310,6 +310,8 @@ github.com/jaytaylor/html2text v0.0.0-20180606194806-57d518f124b0/go.mod h1:CVKl
github.com/jaytaylor/html2text v0.0.0-20260303211410-1a4bdc82ecec h1:DrV+GDNKHeHyfqEZaoxQoHlWcgTBiaJ8ZUyNyd5vvkY=
github.com/jaytaylor/html2text v0.0.0-20260303211410-1a4bdc82ecec/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk=
github.com/jellevandenhooff/dkim v0.0.0-20150330215556-f50fe3d243e1/go.mod h1:E0B/fFc00Y+Rasa88328GlI/XbtyysCtTHZS8h7IrBU=
github.com/jgheithcock/pdf v0.0.0-20260404175814-28cd6530c1fe h1:9GAP+hdboArdSUwi82IXaNd+Qq8+cGFQh7xAcwZNN+s=
github.com/jgheithcock/pdf v0.0.0-20260404175814-28cd6530c1fe/go.mod h1:1fEHWurg7pvf5SG6XNE5Q8UZmOwex51Mkx3SLhrW5B4=
github.com/jhump/protoreflect v1.17.0 h1:qOEr613fac2lOuTgWN4tPAtLL7fUSbuJL5X5XumQh94=
github.com/jhump/protoreflect v1.17.0/go.mod h1:h9+vUUL38jiBzck8ck+6G/aeMX8Z4QUY/NiJPwPNi+8=
github.com/jmoiron/sqlx v1.4.0 h1:1PLqN7S1UYp5t4SrVVnt4nUVNemrDAtxlulVe+Qgm3o=

View file

@ -28,6 +28,21 @@ func TestPdfFile(t *testing.T) {
require.Equal(t, contentText, extractedText)
}
func TestPdfDeeplyNestedObjects(t *testing.T) {
// Test for MM-63434
var buf bytes.Buffer
buf.WriteString("%PDF-1.0\n")
for range 10_000 {
buf.WriteString("0\n0\nobj\n")
}
buf.WriteString("startxref\n0\n%%EOF\n")
extractor := pdfExtractor{}
text, err := extractor.Extract("excessive-nests.pdf", bytes.NewReader(buf.Bytes()), 0)
require.Error(t, err)
require.Empty(t, text)
}
func TestWrongPdfFile(t *testing.T) {
extractor := pdfExtractor{}
content, err := testutils.ReadTestFile("sample-doc.docx")