Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pdf pages to zip #4426

Merged
merged 2 commits into from
Jun 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions cozy.example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,9 @@ jobs:
# path to the imagemagick convert binary
# imagemagick_convert_cmd: convert

# path to the ghostscript binary
# ghostscript_cmd: gs

# Specify whether the given list of jobs is an allowlist or blocklist. In case
# of an allowlist, all jobs are deactivated by default and only the listed ones
# are activated.
Expand Down
7 changes: 7 additions & 0 deletions docs/files.md
Original file line number Diff line number Diff line change
Expand Up @@ -1403,6 +1403,10 @@ sub-directories in the archive.
It's possible to give a file by its id (in the `ids` array) or by its path (in
the `files` array).

For PDF files, it's possible to put a single page in the archive with the
`pages` argument: it's an array of objects, where `id` is the file identifier
of the PDF file and `page` is the page number (1 is the first page).

The generated archive is temporary and is not persisted.

#### Request
Expand All @@ -1423,6 +1427,9 @@ Content-Type: application/vnd.api+json
"/Documents/bills",
"/Documents/images/sunset.jpg",
"/Documents/images/eiffel-tower.jpg"
],
"pages": [
{ "id": "3780caf0-104f-013d-3619-18c04daba326", "page": 1 }
]
}
}
Expand Down
7 changes: 6 additions & 1 deletion docs/workers.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ creates a zip archive from files in the VFS. The options are:
- `dir_id`: the directory identifier where the zip archive will be put
- `filename`: the name of the zip archive.

**Note:** it is possible to include a single page of a PDF file instead of the
whole file, by using an object with `id` and `page` instead of just the file
identifier.

### Example

```json
Expand All @@ -153,7 +156,9 @@ creates a zip archive from files in the VFS. The options are:
"selection/two.pdf": "36eb54c8-90fe-11e9-aeca-03ddc3acf91c",
"selection/three.pdf": "37284586-90fe-11e9-be6d-179f72076e43",
"selection/four.pdf": "37655462-90fe-11e9-9059-8739e3746720",
"selection/five.pdf": "379fedfc-90fe-11e9-849f-0bbe172eba5f"
"selection/five.pdf": "379fedfc-90fe-11e9-849f-0bbe172eba5f",
"selection/front.pdf": { "id": "49ca9e50-1074-013d-361a-18c04daba326", "page": 1 },
"selection/back.pdf": { "id": "49ca9e50-1074-013d-361a-18c04daba326", "page": 2 }
},
"dir_id": "3657ce9c-90fe-11e9-b40b-33baf841bcb8",
"filename": "selection.zip"
Expand Down
56 changes: 48 additions & 8 deletions model/vfs/archive.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"path/filepath"
"strings"

"github.com/cozy/cozy-stack/pkg/config/config"
"github.com/cozy/cozy-stack/pkg/consts"
"github.com/cozy/cozy-stack/pkg/couchdb"
"github.com/labstack/echo/v4"
Expand All @@ -23,17 +24,24 @@ type Archive struct {
Secret string `json:"-"`
IDs []string `json:"ids"`
Files []string `json:"files"`
Pages []Page `json:"pages"`

// archiveEntries cache
entries []ArchiveEntry
}

// Page identifies a single page of a PDF file to put in an archive.
type Page struct {
// ID is the identifier of the PDF file.
ID string `json:"id"`
// Page is the page number (1 is the first page).
Page int `json:"page"`
}

// ArchiveEntry is a utility struct describing a directory or a file to be
// placed in the archive.
type ArchiveEntry struct {
// root is the path of the entry in the VFS.
root string
// Dir and File hold the document found for the entry, as returned by the
// VFS lookup (presumably only one of them is set — confirm against the VFS).
Dir *DirDoc
File *FileDoc
// Page is the page number to extract when only one page of a PDF file goes
// in the archive. It is left to zero for entries built from IDs or Files, and
// a value <= 0 means that the whole file is copied.
Page int
}

var plusEscaper = strings.NewReplacer("+", "%20")
Expand Down Expand Up @@ -66,9 +74,8 @@ func ContentDisposition(disposition, filename string) string {
// GetEntries returns all files and folders in the archive as ArchiveEntry.
func (a *Archive) GetEntries(fs VFS) ([]ArchiveEntry, error) {
if a.entries == nil {
n := len(a.IDs)
entries := make([]ArchiveEntry, n+len(a.Files))
for i, id := range a.IDs {
entries := make([]ArchiveEntry, 0, len(a.IDs)+len(a.Files)+len(a.Pages))
for _, id := range a.IDs {
d, f, err := fs.DirOrFileByID(id)
if err != nil {
return nil, err
Expand All @@ -82,22 +89,37 @@ func (a *Archive) GetEntries(fs VFS) ([]ArchiveEntry, error) {
return nil, err
}
}
entries[i] = ArchiveEntry{
entries = append(entries, ArchiveEntry{
root: root,
Dir: d,
File: f,
}
})
}
for i, root := range a.Files {
for _, root := range a.Files {
d, f, err := fs.DirOrFileByPath(root)
if err != nil {
return nil, err
}
entries[n+i] = ArchiveEntry{
entries = append(entries, ArchiveEntry{
root: root,
Dir: d,
File: f,
})
}
for _, page := range a.Pages {
f, err := fs.FileByID(page.ID)
if err != nil {
return nil, err
}
root, err := f.Path(fs)
if err != nil {
return nil, err
}
entries = append(entries, ArchiveEntry{
root: root,
File: f,
Page: page.Page,
})
}

a.entries = entries
Expand Down Expand Up @@ -140,6 +162,9 @@ func (a *Archive) Serve(fs VFS, w http.ResponseWriter) error {
Method: zip.Deflate,
Modified: file.UpdatedAt,
}
// Only entries for a single PDF page get the page number added to their
// name. Page is zero for entries that put the whole file in the archive
// (the copy below treats Page <= 0 the same way), so they keep their name.
if entry.Page > 0 {
	header.Name = addPageToName(header.Name, entry.Page)
}
ze, err := zw.CreateHeader(header)
if err != nil {
return fmt.Errorf("Can't create zip entry <%s>: %s", name, err)
Expand All @@ -149,7 +174,16 @@ func (a *Archive) Serve(fs VFS, w http.ResponseWriter) error {
return fmt.Errorf("Can't open file <%s>: %s", name, err)
}
defer f.Close()
_, err = io.Copy(ze, f)

if entry.Page <= 0 {
_, err = io.Copy(ze, f)
return err
}
extracted, err := config.PDF().ExtractPage(f, entry.Page)
if err != nil {
return err
}
_, err = io.Copy(ze, extracted)
return err
}, 0)
if err != nil {
Expand Down Expand Up @@ -189,3 +223,9 @@ func (a *Archive) SetID(_ string) {}

// SetRev makes Archive a jsonapi.Object
func (a *Archive) SetRev(_ string) {}

// addPageToName inserts the page number, in parentheses, between the base
// name and the extension of name. For example, "driving licence.pdf" with
// page 2 becomes "driving licence (2).pdf". When name has no extension, the
// page number is simply appended.
func addPageToName(name string, page int) string {
	suffix := filepath.Ext(name)
	return fmt.Sprintf("%s (%d)%s", strings.TrimSuffix(name, suffix), page, suffix)
}
12 changes: 12 additions & 0 deletions model/vfs/archive_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
package vfs

import (
"testing"

"github.com/stretchr/testify/require"
)

func TestAddPageToName(t *testing.T) {
name := addPageToName("driving licence.pdf", 2)
require.Equal(t, "driving licence (2).pdf", name)
}
10 changes: 10 additions & 0 deletions pkg/config/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"github.com/cozy/cozy-stack/pkg/limits"
"github.com/cozy/cozy-stack/pkg/lock"
"github.com/cozy/cozy-stack/pkg/logger"
"github.com/cozy/cozy-stack/pkg/pdf"
"github.com/cozy/cozy-stack/pkg/tlsclient"
"github.com/cozy/cozy-stack/pkg/utils"
"github.com/cozy/gomail"
Expand Down Expand Up @@ -116,6 +117,7 @@ type Config struct {
AuthorizedForConfirm []string

Avatars *avatar.Service
PDF *pdf.Service
Fs Fs
Keyring keyring.Keyring
CouchDB CouchDB
Expand Down Expand Up @@ -376,6 +378,11 @@ func Avatars() *avatar.Service {
return config.Avatars
}

// PDF returns the configured PDF service.
func PDF() *pdf.Service {
return config.PDF
}

// GetKeyring returns the configured instance of [keyring.Keyring]
func GetKeyring() keyring.Keyring {
return config.Keyring
Expand Down Expand Up @@ -485,6 +492,7 @@ func Setup(cfgFile string) (err error) {

func applyDefaults(v *viper.Viper) {
v.SetDefault("password_reset_interval", defaultPasswordResetInterval)
v.SetDefault("jobs.ghostscript_cmd", "gs")
v.SetDefault("jobs.imagemagick_convert_cmd", "convert")
v.SetDefault("jobs.defaultDurationToKeep", "2W")
v.SetDefault("assets_polling_disabled", false)
Expand Down Expand Up @@ -749,6 +757,7 @@ func UseViper(v *viper.Viper) error {

cacheStorage := cache.New(cacheRedis)
avatars := avatar.NewService(cacheStorage, v.GetString("jobs.imagemagick_convert_cmd"))
pdfService := pdf.NewService(v.GetString("jobs.ghostscript_cmd"))

// Setup keyring
var keyringCfg keyring.Config
Expand Down Expand Up @@ -823,6 +832,7 @@ func UseViper(v *viper.Viper) error {
RemoteAssets: v.GetStringMapString("remote_assets"),

Avatars: avatars,
PDF: pdfService,
Keyring: keyring,
Fs: Fs{
URL: fsURL,
Expand Down
50 changes: 50 additions & 0 deletions pkg/pdf/pdf.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Package pdf is for manipulating PDF files.
package pdf

import (
"bytes"
"fmt"
"io"
"os/exec"

"github.com/cozy/cozy-stack/pkg/logger"
)

// Service provides methods for manipulating PDF files.
type Service struct {
// ghostscriptCmd is the name or path of the ghostscript binary to execute.
ghostscriptCmd string
}

// NewService instantiates a new [Service] that runs the given ghostscript
// command.
func NewService(ghostscriptCmd string) *Service {
	svc := Service{ghostscriptCmd: ghostscriptCmd}
	return &svc
}

// ExtractPage extracts a single page from a PDF.
//
// The PDF is read from stdin and piped to the ghostscript command, with page
// given as both the first and the last page to keep. The returned buffer holds
// what the command wrote on its standard output. When the command fails, its
// standard error is logged and an error wrapping the failure is returned.
func (s *Service) ExtractPage(stdin io.Reader, page int) (*bytes.Buffer, error) {
	// Using the same page as first and last page keeps only that page.
	firstPage := fmt.Sprintf("-dFirstPage=%d", page)
	lastPage := fmt.Sprintf("-dLastPage=%d", page)

	cmd := exec.Command(
		s.ghostscriptCmd,
		"-q",
		"-sDEVICE=pdfwrite",
		"-dNOPAUSE",
		"-dBATCH",
		"-dSAFER",
		firstPage,
		lastPage,
		"-sOutputFile=-",
		"-", // The input comes from stdin
	)

	output := new(bytes.Buffer)
	var errOutput bytes.Buffer
	cmd.Stdin = stdin
	cmd.Stdout = output
	cmd.Stderr = &errOutput

	err := cmd.Run()
	if err == nil {
		return output, nil
	}

	// The stderr of the command is only available here, so it is logged
	// before the error is returned to the caller.
	logger.WithNamespace("pdf").
		WithField("stderr", errOutput.String()).
		Errorf("ghostscript failed: %s", err)
	return nil, fmt.Errorf("failed to run the cmd %q: %w", s.ghostscriptCmd, err)
}
38 changes: 38 additions & 0 deletions pkg/pdf/pdf_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package pdf

import (
"os"
"os/exec"
"testing"

"github.com/stretchr/testify/require"
)

func Test_Extract_Page(t *testing.T) {
if testing.Short() {
t.Skipf("this test require the \"gs\" binary, skip it due to the \"--short\" flag")
}

service := NewService("gs")
input, err := os.Open("../../tests/fixtures/dev-desktop.pdf")
require.NoError(t, err)
defer input.Close()

extracted, err := service.ExtractPage(input, 1)
require.NoError(t, err)

// We cannot compare the output to an expected PDF file, as there many
// things that change from one run to another: CreationDate, uuid, etc.
// So, we are checking that it's a PDF, and it has the expected signature
// from ImageMagick.
content := extracted.Bytes()
start := []byte("%PDF-1.7")
require.Equal(t, start, content[:len(start)])

expected := "5b49b84d59866b2f6d825957c55c2c2681656e5de52e30c1439fbaf197fe1d14"
cmd := exec.Command("identify", "-quiet", "-format", "%#", "-")
cmd.Stdin = extracted
signature, err := cmd.Output()
require.NoError(t, err)
require.Equal(t, []byte(expected), signature)
}
1 change: 1 addition & 0 deletions scripts/docker/cozy-app-dev/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ RUN set -eux; apt-get update \
openssl \
fonts-lato \
imagemagick \
ghostscript \
git \
&& curl https://couchdb.apache.org/repo/keys.asc | gpg --dearmor > /usr/share/keyrings/couchdb-archive-keyring.gpg \
&& echo "deb [signed-by=/usr/share/keyrings/couchdb-archive-keyring.gpg] https://apache.jfrog.io/artifactory/couchdb-deb/ bullseye main" > /etc/apt/sources.list.d/couchdb.list \
Expand Down
2 changes: 1 addition & 1 deletion worker/archive/unzip_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ func Test_archive(t *testing.T) {
assert.NoError(t, err)
assert.NoError(t, file.Close())

files := map[string]string{
files := map[string]interface{}{
"wet-cozy.jpg": one.ID(),
"hello.txt": two.ID(),
}
Expand Down
Loading
Loading