Skip to content

Commit

Permalink
Merge pull request #5 from mazeForGit/main
Browse files Browse the repository at this point in the history
multiple bugfixes
  • Loading branch information
dslipak committed Jan 24, 2024
2 parents 3b0849d + 4da8881 commit 636e0c0
Show file tree
Hide file tree
Showing 15 changed files with 287 additions and 33 deletions.
68 changes: 50 additions & 18 deletions page.go
Original file line number Diff line number Diff line change
Expand Up @@ -752,17 +752,46 @@ func (p Page) walkTextBlocks(walker func(enc TextEncoding, x, y float64, s strin
}
})
}

// Content returns the text and rectangles found in the page's content.
//
// The page's /Contents entry is interpreted in one of two ways:
//   - when it reports a length of zero it is handed to readContent as is;
//   - otherwise it is treated as an array (per the original bugfix note,
//     /Contents may hold an array of references, which previously led to
//     an endless loop) and every element is read on its own, with the
//     Text and Rect results appended in array order.
func (p Page) Content() Content {
	contents := p.V.Key("Contents")

	count := contents.Len()
	if count == 0 {
		// Nothing to index into: interpret the value itself.
		return p.readContent(contents)
	}

	var (
		texts []Text
		rects []Rect
	)
	for idx := 0; idx < count; idx++ {
		part := p.readContent(contents.Index(idx))
		texts = append(texts, part.Text...)
		rects = append(rects, part.Rect...)
	}
	return Content{texts, rects}
}

func (p Page) readContent(strm Value) Content {
var enc TextEncoding = &nopEncoder{}

var g = gstate{
Th: 1,
CTM: ident,
}

var text []Text
showText := func(s string) {
n := 0
Expand Down Expand Up @@ -843,9 +872,10 @@ func (p Page) Content() Content {

case "Q": // restore graphics state
n := len(gstack) - 1
g = gstack[n]
gstack = gstack[:n]

if n >= 0 { // bugfix: don't raise an exception
g = gstack[n]
gstack = gstack[:n]
}
case "BT": // begin text (reset text matrix and line matrix)
g.Tm = ident
g.Tlm = g.Tm
Expand Down Expand Up @@ -914,23 +944,25 @@ func (p Page) Content() Content {
showText(args[0].RawString())

case "TJ": // show text, allowing individual glyph positioning
v := args[0]
for i := 0; i < v.Len(); i++ {
x := v.Index(i)
if x.Kind() == String {
if i == v.Len()-1 {
showText(x.RawString())
op = "BT"
continue
if len(args) > 0 { // bugfix: don't raise an exception
v := args[0]
for i := 0; i < v.Len(); i++ {
x := v.Index(i)
if x.Kind() == String {
if i == v.Len()-1 {
showText(x.RawString())
op = "BT"
continue
} else {
showText(x.RawString())
}
} else {
showText(x.RawString())
tx := -x.Float64() / 1000 * g.Tfs * g.Th
g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
}
} else {
tx := -x.Float64() / 1000 * g.Tfs * g.Th
g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm)
}
// showText("\n")
}
// showText("\n")

case "TL": // set text leading
if len(args) != 1 {
Expand Down
239 changes: 226 additions & 13 deletions pdf_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,145 @@ package pdf
import (
"bytes"
"fmt"
"os"
"strconv"
"strings"
"testing"
"path/filepath"
)

const testFile = "/Users/dslipak/Documents/dslipak-20190925.pdf"
// referenceFirstPage is the text the tests in this file expect
// readPdfAndGetFirstPageAsText to return for the first page of the
// story_*.pdf fixtures.
var referenceFirstPage = `TEST FILE
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam
nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam
erat, sed diam voluptua. At vero eos et accusam et
TEST
SUBTITLE`

func TestReadPdf(t *testing.T) {
f, err := Open(testFile)
if err != nil {
t.Error("Doc should not be nil', got ", err)
// referenceFirstPageWithAddLine is the expected first-page text used by
// Test_ReadPdf_v17_linarized_xrefStream.
// NOTE(review): the name and that test's TODO suggest this literal should
// differ from referenceFirstPage by one additional empty line, but as written
// here the two literals look identical - confirm against the fixture output.
var referenceFirstPageWithAddLine = `TEST FILE
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam
nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam
erat, sed diam voluptua. At vero eos et accusam et
TEST
SUBTITLE`

// Test_ReadPdf_v17_linarized_xrefStream checks the page count and the
// first-page text of the "linarized_xrefStream" fixture.
//
// This PDF keeps an object inside a stream, which is handled differently:
// the original implementation calculated the stream but did not return the
// object when resolving it.
//
// TODO: an empty line is added to the extracted text; its origin is still
// unknown.
func Test_ReadPdf_v17_linarized_xrefStream(t *testing.T) {
	const wantPages = 5

	gotPages, gotText := readPdfAndGetFirstPageAsText(
		"./testdata/story_Word2019-2312-1601712620132-32_Print-Adobe__pdf15_linarized_xrefStream.pdf",
	)

	if gotPages != wantPages {
		t.Error("Asser: incorrect numPage .. want=5 <> got " + strconv.Itoa(gotPages))
	}
	if gotText != referenceFirstPageWithAddLine {
		t.Error("Asser: content different from reference:")
		t.Error(gotText)
	}
}
func Test_ReadPdf_v17_linarized_xref(t *testing.T) {

totalPage := f.NumPage()
var buf bytes.Buffer
testFile := "./testdata/story_avepdf-com__pdf17_linarized_xref.pdf"
totalPages, content := readPdfAndGetFirstPageAsText(testFile)
if totalPages != 5 {
t.Error("Asser: incorrect numPage .. want=5 <> got " + strconv.Itoa(totalPages))
}
if referenceFirstPage != content {
t.Error("Asser: content different from reference:")
t.Error(content)
}
}
//
// this pdf has an array of refs at /Contents
// standard:
// page = {<</Contents 4 0 R /Group <</CS /DeviceRGB /S /Transparency /Type /Group>> /MediaBox [0 0 612 792] /Parent 2 0 R /Resources <</ExtGState <</GS7 7 0 R /GS8 8 0 R>> /Font <</F1 5 0 R /F2 9 0 R /F3 11 0 R>> /ProcSet [/PDF /Text /ImageB /ImageC /ImageI]>> /StructParents 0 /Type /Page>>}
// deviation:
// page = {<</Contents [20 0 R] /CropBox [0 0 595.32001 841.92004] /MediaBox [0 0 595.32001 841.92004] /Parent 2 0 R /Resources 21 0 R /Rotate 0 /Type /Page>>}
//
func Test_ReadPdf_v17_trailer_arrayAtPageContents(t *testing.T) {

for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
p := f.Page(pageIndex)
if p.V.IsNull() {
continue
}
testFile := "./testdata/story_Word2019-2312-1712620132_Print-Microsoft__pdf17_trailer_array-at-page-contents.pdf"
totalPages, content := readPdfAndGetFirstPageAsText(testFile)
if totalPages != 5 {
t.Error("Asser: incorrect numPage .. want=5 <> got " + strconv.Itoa(totalPages))
}
if referenceFirstPage != content {
t.Error("Asser: content different from reference:")
t.Error(content)
}
}
// Test_ReadPdf_v17_StandardPDFA_trailer checks the page count and the
// first-page text of the "SaveAs-Standard-PDFA" fixture against
// referenceFirstPage.
func Test_ReadPdf_v17_StandardPDFA_trailer(t *testing.T) {
	const wantPages = 5

	gotPages, gotText := readPdfAndGetFirstPageAsText(
		"./testdata/story_Word2019-2312-1712620132_SaveAs-Standard-PDFA__pdf17_trailer.pdf",
	)

	if gotPages != wantPages {
		t.Error("Asser: incorrect numPage .. want=5 <> got " + strconv.Itoa(gotPages))
	}
	if gotText != referenceFirstPage {
		t.Error("Asser: content different from reference:")
		t.Error(gotText)
	}
}
// Test_ReadPdf_v17_MinSizePDFA_trailer checks the page count and the
// first-page text of the "SaveAs-MinSize-PDFA" fixture against
// referenceFirstPage.
func Test_ReadPdf_v17_MinSizePDFA_trailer(t *testing.T) {
	testFile := "./testdata/story_Word2019-2312-1712620132_SaveAs-MinSize-PDFA__pdf17_trailer.pdf"

	totalPages, content := readPdfAndGetFirstPageAsText(testFile)
	if totalPages != 5 {
		// The previous message had source code pasted into it
		// ("incorrect if totalPages != 5 { .."); report the failing value instead.
		t.Error("Assert: incorrect totalPages .. want=5 <> got " + strconv.Itoa(totalPages))
	}
	if referenceFirstPage != content {
		t.Error("Assert: content different from reference")
		t.Error(content)
	}
}
// Test_ReadPdf_v17_StandardNoPDFA_2trailer checks the page count and the
// first-page text of the "SaveAs-Standard-NoPDFA" fixture against
// referenceFirstPage.
func Test_ReadPdf_v17_StandardNoPDFA_2trailer(t *testing.T) {
	const wantPages = 5

	gotPages, gotText := readPdfAndGetFirstPageAsText(
		"./testdata/story_Word2019-2312-1712620132_SaveAs-Standard-NoPDFA__pdf17_2trailer.pdf",
	)

	if gotPages != wantPages {
		t.Error("Asser: incorrect totalPages .. want=5 <> got " + strconv.Itoa(gotPages))
	}
	if gotText != referenceFirstPage {
		t.Error("Asser: content different from reference")
		t.Error(gotText)
	}
}
// Test_ReadPdf_v17_MinSizeNoPDFA_2trailer checks the page count and the
// first-page text of the "SaveAs-MinSize-NoPDFA" fixture against
// referenceFirstPage.
func Test_ReadPdf_v17_MinSizeNoPDFA_2trailer(t *testing.T) {
	const wantPages = 5

	gotPages, gotText := readPdfAndGetFirstPageAsText(
		"./testdata/story_Word2019-2312-1712620132_SaveAs-MinSize-NoPDFA__pdf17_2trailer.pdf",
	)

	if gotPages != wantPages {
		t.Error("Asser: incorrect totalPages .. want=5 <> got " + strconv.Itoa(gotPages))
	}
	if gotText != referenceFirstPage {
		t.Error("Asser: content different from reference")
		t.Error(gotText)
	}
}
//
// read pdf and return content of first page for quick check
//
func readPdfAndGetFirstPageAsText(fileName string) (totalPages int, content string) {
fmt.Println("read file = " + fileName)

f, err := Open(fileName)
if err != nil {
return 0, err.Error()
}

totalPages = f.NumPage()
if totalPages == 0 {
return totalPages, content
} else {

var buf bytes.Buffer
p := f.Page(1)
texts := p.Content().Text
var lastY = 0.0
line := ""
Expand All @@ -42,7 +161,101 @@ func TestReadPdf(t *testing.T) {
lastY = text.Y
}
buf.WriteString(line)
content = strings.TrimSpace(buf.String())
}
fmt.Println(buf.String())

return totalPages, content
}
// Test_WalkDirectory_ReadPdfs extracts the text of every page of each
// ./testdata/*.pdf file and writes it next to the PDF as a .txt file.
//
// A fixture that cannot be opened, or an output file that cannot be created,
// is reported with t.Error and skipped so that the remaining files are still
// processed and the failed handle is never used.
func Test_WalkDirectory_ReadPdfs(t *testing.T) {
	const startPath = "./testdata"

	files, err := walkDir(startPath, ".pdf")
	if err != nil {
		// Without a file list there is nothing meaningful left to test.
		t.Fatal("Assert: " + err.Error())
	}

	for _, testFile := range files {
		if !strings.HasSuffix(testFile, ".pdf") {
			continue
		}

		fmt.Println(". open testFile = ", testFile)
		f, err := Open(testFile)
		if err != nil {
			// f must not be used after a failed Open.
			t.Error(err)
			continue
		}

		totalPage := f.NumPage()
		fmt.Println(". totalPage = ", totalPage)

		var buf bytes.Buffer
		for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
			p := f.Page(pageIndex)
			if p.V.IsNull() {
				continue
			}

			// Group consecutive text fragments into lines: a change of the
			// Y coordinate (after the first fragment) starts a new line.
			lastY := 0.0
			line := ""
			for _, text := range p.Content().Text {
				if lastY != text.Y && lastY > 0 {
					buf.WriteString(line + "\n")
					line = text.S
				} else {
					line += text.S
				}
				lastY = text.Y
			}
			buf.WriteString(line)
		}

		// Swap only the trailing extension; strings.Replace would also rewrite
		// any other ".pdf" occurring earlier in the path.
		writeToFileName := strings.TrimSuffix(testFile, ".pdf") + ".txt"
		fmt.Println(".. writeToFileName = ", writeToFileName)

		fw, err := os.Create(writeToFileName)
		if err != nil {
			t.Error(err)
			continue
		}
		if _, err := fw.Write(buf.Bytes()); err != nil {
			t.Error(err)
		}
		// Close can report a deferred write failure, so check it as well.
		if err := fw.Close(); err != nil {
			t.Error(err)
		}
	}
}
// walkDir walks the directory tree rooted at root and returns the paths of
// all non-directory entries whose path ends with fileSuffix.
//
// Any error reported by the walk (for example a root that does not exist) is
// returned to the caller and no file list is returned with it.
func walkDir(root, fileSuffix string) ([]string, error) {
	var files []string
	err := filepath.Walk(root, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			// info may be nil when err is set (e.g. the root cannot be
			// stat'ed), so it must not be dereferenced here.
			return err
		}
		if !info.IsDir() && strings.HasSuffix(path, fileSuffix) {
			files = append(files, path)
		}
		return nil
	})
	if err != nil {
		return nil, err
	}
	return files, nil
}
Loading

0 comments on commit 636e0c0

Please sign in to comment.