diff --git a/page.go b/page.go index 37b1adc..9647357 100644 --- a/page.go +++ b/page.go @@ -752,17 +752,46 @@ func (p Page) walkTextBlocks(walker func(enc TextEncoding, x, y float64, s strin } }) } - +// // Content returns the page's content. +// +// bugfix: +// the /Content may contain an array of refs +// this leads to an endless loop +// func (p Page) Content() Content { + + var text []Text + var rect []Rect + + //fmt.Println("page=",p) strm := p.V.Key("Contents") + + if strm.Len() == 0 { + c := p.readContent(strm) + text = c.Text + rect = c.Rect + } else { + for i := 0; i < strm.Len(); i++ { + strmindex := strm.Index(i) + //fmt.Println("stream ",i,"=",strmindex) + + c := p.readContent(strmindex) + text = append(text, c.Text...) + rect = append(rect, c.Rect...) + } + } + return Content{text, rect} +} + +func (p Page) readContent(strm Value) Content { var enc TextEncoding = &nopEncoder{} var g = gstate{ Th: 1, CTM: ident, } - + var text []Text showText := func(s string) { n := 0 @@ -843,9 +872,10 @@ func (p Page) Content() Content { case "Q": // restore graphics state n := len(gstack) - 1 - g = gstack[n] - gstack = gstack[:n] - + if n >= 0 { // bugfix: don't raise an exception + g = gstack[n] + gstack = gstack[:n] + } case "BT": // begin text (reset text matrix and line matrix) g.Tm = ident g.Tlm = g.Tm @@ -914,23 +944,25 @@ func (p Page) Content() Content { showText(args[0].RawString()) case "TJ": // show text, allowing individual glyph positioning - v := args[0] - for i := 0; i < v.Len(); i++ { - x := v.Index(i) - if x.Kind() == String { - if i == v.Len()-1 { - showText(x.RawString()) - op = "BT" - continue + if len(args) > 0 { // bugfix: don't raise an exception + v := args[0] + for i := 0; i < v.Len(); i++ { + x := v.Index(i) + if x.Kind() == String { + if i == v.Len()-1 { + showText(x.RawString()) + op = "BT" + continue + } else { + showText(x.RawString()) + } } else { - showText(x.RawString()) + tx := -x.Float64() / 1000 * g.Tfs * g.Th + g.Tm = matrix{{1, 
0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm) } - } else { - tx := -x.Float64() / 1000 * g.Tfs * g.Th - g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm) } + // showText("\n") } - // showText("\n") case "TL": // set text leading if len(args) != 1 { diff --git a/pdf_test.go b/pdf_test.go index 345ee95..0681b32 100644 --- a/pdf_test.go +++ b/pdf_test.go @@ -3,26 +3,145 @@ package pdf import ( "bytes" "fmt" + "os" + "strconv" + "strings" "testing" + "path/filepath" ) -const testFile = "/Users/dslipak/Documents/dslipak-20190925.pdf" +var referenceFirstPage = `TEST FILE + +Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam +nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam +erat, sed diam voluptua. At vero eos et accusam et +TEST +SUBTITLE` -func TestReadPdf(t *testing.T) { - f, err := Open(testFile) - if err != nil { - t.Error("Doc should not be nil', got ", err) +var referenceFirstPageWithAddLine = `TEST FILE + +Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam +nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam +erat, sed diam voluptua. At vero eos et accusam et + +TEST +SUBTITLE` + +// +// this pdf has an object within stream which is handled different! +// the original implementation calculated the stream but didn't returned the object at resolve +// +// @todo: there is an empty line added, still don't know where +// +func Test_ReadPdf_v17_linarized_xrefStream(t *testing.T) { + + testFile := "./testdata/story_Word2019-2312-1601712620132-32_Print-Adobe__pdf15_linarized_xrefStream.pdf" + totalPages, content := readPdfAndGetFirstPageAsText(testFile) + if totalPages != 5 { + t.Error("Asser: incorrect numPage .. 
want=5 <> got " + strconv.Itoa(totalPages)) } + if referenceFirstPageWithAddLine != content { + t.Error("Asser: content different from reference:") + t.Error(content) + } +} +func Test_ReadPdf_v17_linarized_xref(t *testing.T) { - totalPage := f.NumPage() - var buf bytes.Buffer + testFile := "./testdata/story_avepdf-com__pdf17_linarized_xref.pdf" + totalPages, content := readPdfAndGetFirstPageAsText(testFile) + if totalPages != 5 { + t.Error("Asser: incorrect numPage .. want=5 <> got " + strconv.Itoa(totalPages)) + } + if referenceFirstPage != content { + t.Error("Asser: content different from reference:") + t.Error(content) + } +} +// +// this pdf has an array of refs at /Contents +// standard: +// page = {<> /MediaBox [0 0 612 792] /Parent 2 0 R /Resources <> /Font <> /ProcSet [/PDF /Text /ImageB /ImageC /ImageI]>> /StructParents 0 /Type /Page>>} +// deviation: +// page = {<>} +// +func Test_ReadPdf_v17_trailer_arrayAtPageContents(t *testing.T) { - for pageIndex := 1; pageIndex <= totalPage; pageIndex++ { - p := f.Page(pageIndex) - if p.V.IsNull() { - continue - } + testFile := "./testdata/story_Word2019-2312-1712620132_Print-Microsoft__pdf17_trailer_array-at-page-contents.pdf" + totalPages, content := readPdfAndGetFirstPageAsText(testFile) + if totalPages != 5 { + t.Error("Asser: incorrect numPage .. want=5 <> got " + strconv.Itoa(totalPages)) + } + if referenceFirstPage != content { + t.Error("Asser: content different from reference:") + t.Error(content) + } +} +func Test_ReadPdf_v17_StandardPDFA_trailer(t *testing.T) { + + testFile := "./testdata/story_Word2019-2312-1712620132_SaveAs-Standard-PDFA__pdf17_trailer.pdf" + totalPages, content := readPdfAndGetFirstPageAsText(testFile) + if totalPages != 5 { + t.Error("Asser: incorrect numPage .. 
want=5 <> got " + strconv.Itoa(totalPages)) + } + if referenceFirstPage != content { + t.Error("Asser: content different from reference:") + t.Error(content) + } +} +func Test_ReadPdf_v17_MinSizePDFA_trailer(t *testing.T) { + + testFile := "./testdata/story_Word2019-2312-1712620132_SaveAs-MinSize-PDFA__pdf17_trailer.pdf" + totalPages, content := readPdfAndGetFirstPageAsText(testFile) + if totalPages != 5 { + t.Error("Asser: incorrect if totalPages != 5 { .. want=5 <> got " + strconv.Itoa(totalPages)) + } + if referenceFirstPage != content { + t.Error("Asser: content different from reference") + t.Error(content) + } +} +func Test_ReadPdf_v17_StandardNoPDFA_2trailer(t *testing.T) { + + testFile := "./testdata/story_Word2019-2312-1712620132_SaveAs-Standard-NoPDFA__pdf17_2trailer.pdf" + totalPages, content := readPdfAndGetFirstPageAsText(testFile) + if totalPages != 5 { + t.Error("Asser: incorrect totalPages .. want=5 <> got " + strconv.Itoa(totalPages)) + } + if referenceFirstPage != content { + t.Error("Asser: content different from reference") + t.Error(content) + } +} +func Test_ReadPdf_v17_MinSizeNoPDFA_2trailer(t *testing.T) { + + testFile := "./testdata/story_Word2019-2312-1712620132_SaveAs-MinSize-NoPDFA__pdf17_2trailer.pdf" + totalPages, content := readPdfAndGetFirstPageAsText(testFile) + if totalPages != 5 { + t.Error("Asser: incorrect totalPages .. 
want=5 <> got " + strconv.Itoa(totalPages)) + } + if referenceFirstPage != content { + t.Error("Asser: content different from reference") + t.Error(content) + } +} +// +// read pdf and return content of first page for quick check +// +func readPdfAndGetFirstPageAsText(fileName string) (totalPages int, content string) { + fmt.Println("read file = " + fileName) + + f, err := Open(fileName) + if err != nil { + return 0, err.Error() + } + totalPages = f.NumPage() + if totalPages == 0 { + return totalPages, content + } else { + + var buf bytes.Buffer + p := f.Page(1) texts := p.Content().Text var lastY = 0.0 line := "" @@ -42,7 +161,101 @@ func TestReadPdf(t *testing.T) { lastY = text.Y } buf.WriteString(line) + content = strings.TrimSpace(buf.String()) } - fmt.Println(buf.String()) + + return totalPages, content } +// +// process all pdfs within ./testdata/*.pdf and write content to *.txt +// +func Test_WalkDirectory_ReadPdfs(t *testing.T) { + + // get files + var startPath string = "./testdata" + files, err := walkDir(startPath, ".pdf") + if err != nil { + t.Error("Assert: " + err.Error()) + } + + // read files + for i:=0; i 0 { + buf.WriteString(line + "\n") + line = text.S + } else { + line += text.S + } + } else { + line += text.S + } + + lastY = text.Y + } + buf.WriteString(line) + } + + // + //fmt.Println(buf.String()) + + // + // write bytes buffer to txt-file + writeToFileName := strings.Replace(testFile, ".pdf", ".txt", -1) + fmt.Println(".. 
// walkDir walks the directory tree rooted at root and returns the paths of
// all non-directory entries whose path ends with fileSuffix.
//
// Paths are reported exactly as filepath.Walk produces them, in lexical walk
// order. The first error encountered while walking (for example a root that
// does not exist or a directory that cannot be read) stops the walk and is
// returned with a nil slice.
func walkDir(root, fileSuffix string) ([]string, error) {
	var files []string
	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			// When err is set, info can be nil (e.g. root could not be
			// stat'ed); dereferencing it would panic, so propagate instead.
			return err
		}
		if !info.IsDir() && strings.HasSuffix(path, fileSuffix) {
			files = append(files, path)
		}
		return nil
	})
	if err != nil {
		return nil, err
	}
	return files, nil
}
index 0000000..07cad3b Binary files /dev/null and b/testdata/story_Word2019-2312-1601712620132-32_Print-Adobe__pdf15_linarized_xrefStream.pdf differ diff --git a/testdata/story_Word2019-2312-1601712620132-32_SaveAs-Standard-NoPDFA__pdf17_trailer.pdf b/testdata/story_Word2019-2312-1601712620132-32_SaveAs-Standard-NoPDFA__pdf17_trailer.pdf new file mode 100644 index 0000000..2e1901a Binary files /dev/null and b/testdata/story_Word2019-2312-1601712620132-32_SaveAs-Standard-NoPDFA__pdf17_trailer.pdf differ diff --git a/testdata/story_Word2019-2312-1601712620132-32_SaveAs-Standard-PDFA__pdf17_2trailer.pdf b/testdata/story_Word2019-2312-1601712620132-32_SaveAs-Standard-PDFA__pdf17_2trailer.pdf new file mode 100644 index 0000000..fe0e4cf Binary files /dev/null and b/testdata/story_Word2019-2312-1601712620132-32_SaveAs-Standard-PDFA__pdf17_2trailer.pdf differ diff --git a/testdata/story_Word2019-2312-1712620132_Print-Microsoft__pdf17_trailer_array-at-page-contents.pdf b/testdata/story_Word2019-2312-1712620132_Print-Microsoft__pdf17_trailer_array-at-page-contents.pdf new file mode 100644 index 0000000..df5c56a Binary files /dev/null and b/testdata/story_Word2019-2312-1712620132_Print-Microsoft__pdf17_trailer_array-at-page-contents.pdf differ diff --git a/testdata/story_Word2019-2312-1712620132_SaveAs-MinSize-NoPDFA__pdf17_2trailer.pdf b/testdata/story_Word2019-2312-1712620132_SaveAs-MinSize-NoPDFA__pdf17_2trailer.pdf new file mode 100644 index 0000000..45f2362 Binary files /dev/null and b/testdata/story_Word2019-2312-1712620132_SaveAs-MinSize-NoPDFA__pdf17_2trailer.pdf differ diff --git a/testdata/story_Word2019-2312-1712620132_SaveAs-MinSize-PDFA__pdf17_trailer.pdf b/testdata/story_Word2019-2312-1712620132_SaveAs-MinSize-PDFA__pdf17_trailer.pdf new file mode 100644 index 0000000..e6e1233 Binary files /dev/null and b/testdata/story_Word2019-2312-1712620132_SaveAs-MinSize-PDFA__pdf17_trailer.pdf differ diff --git 
a/testdata/story_Word2019-2312-1712620132_SaveAs-Standard-NoPDFA__pdf17_2trailer.pdf b/testdata/story_Word2019-2312-1712620132_SaveAs-Standard-NoPDFA__pdf17_2trailer.pdf new file mode 100644 index 0000000..8ef1861 Binary files /dev/null and b/testdata/story_Word2019-2312-1712620132_SaveAs-Standard-NoPDFA__pdf17_2trailer.pdf differ diff --git a/testdata/story_Word2019-2312-1712620132_SaveAs-Standard-PDFA__pdf17_trailer.pdf b/testdata/story_Word2019-2312-1712620132_SaveAs-Standard-PDFA__pdf17_trailer.pdf new file mode 100644 index 0000000..996102d Binary files /dev/null and b/testdata/story_Word2019-2312-1712620132_SaveAs-Standard-PDFA__pdf17_trailer.pdf differ diff --git a/testdata/story_avepdf-com__pdf17_linarized_xref.pdf b/testdata/story_avepdf-com__pdf17_linarized_xref.pdf new file mode 100644 index 0000000..d801b2c Binary files /dev/null and b/testdata/story_avepdf-com__pdf17_linarized_xref.pdf differ diff --git a/testdata/story_freeconvert-com__pdf17_2trailer.pdf b/testdata/story_freeconvert-com__pdf17_2trailer.pdf new file mode 100644 index 0000000..e021b2a Binary files /dev/null and b/testdata/story_freeconvert-com__pdf17_2trailer.pdf differ diff --git a/testdata/tableOfContentWithHeader_tcpdf-org__pdf17_trailer.pdf b/testdata/tableOfContentWithHeader_tcpdf-org__pdf17_trailer.pdf new file mode 100644 index 0000000..4d64e9a Binary files /dev/null and b/testdata/tableOfContentWithHeader_tcpdf-org__pdf17_trailer.pdf differ