-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdf.py
84 lines (70 loc) · 2.84 KB
/
pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python
# pdf.py - parse a single pdf file and print every object found in it
import os
import re
import sys
from enum import Enum, auto, unique
from token_stream import EToken, TokenStream
from object_stream import EObject, ObjectStream
#-------------------------------------------------------------------------------
# I want stdout to be unbuffered, always
#-------------------------------------------------------------------------------
class Unbuffered(object):
    """Proxy around a file-like stream that flushes after every write.

    Wrapping a stream in this class makes output show up as soon as it is
    written instead of waiting for the stream's buffer to fill. Any
    attribute that is not defined here is forwarded to the wrapped stream.
    """

    def __init__(self, stream):
        """Remember *stream*, the file-like object that receives all output."""
        self.stream = stream

    def write(self, data):
        """Write *data* to the wrapped stream and flush it right away.

        Returns whatever the wrapped stream's ``write`` returned.
        """
        written = self.stream.write(data)
        self.stream.flush()
        return written

    def writelines(self, lines):
        """Write every item of *lines* to the wrapped stream, then flush.

        Without this override the call would fall through ``__getattr__``
        straight to the wrapped stream and skip the flush.
        """
        self.stream.writelines(lines)
        self.stream.flush()

    def __getattr__(self, attr):
        """Forward lookups of attributes not defined on the proxy."""
        # __getattr__ only runs when normal lookup fails. If ``stream`` itself
        # is missing (instance created without __init__, e.g. by copy or
        # pickle), reading self.stream below would re-enter this method
        # without end, so fail cleanly instead.
        if attr == 'stream':
            raise AttributeError(attr)
        return getattr(self.stream, attr)
# `sys` is already imported at the top of the file; repeating it is harmless.
import sys
# Replace the process-wide stdout with the flushing proxy, so that everything
# written to stdout from here on is flushed immediately after each write.
sys.stdout = Unbuffered(sys.stdout)
#-------------------------------------------------------------------------------
# parse_tokens
#-------------------------------------------------------------------------------
def parse_tokens(filepath):
    """Tokenize the file at *filepath*, printing each token as it is read.

    The file is opened in binary mode and wrapped in a TokenStream. Tokens
    are pulled one at a time until a token of type EToken.EOF is seen. Each
    token is passed the current nesting depth via print_indented(): the depth
    grows after an array/dict/object opener and shrinks before the matching
    closer, so a closer is shown at the same depth as its opener.

    Args:
        filepath: path of the file to tokenize.

    Returns:
        list: every token read before EOF, in file order (the EOF token
        itself is not included).
    """
    # Token types that change the nesting depth; built once, outside the loop
    closers = (EToken.ARRAY_END, EToken.DICT_END, EToken.OBJECT_END)
    openers = (EToken.ARRAY_BEGIN, EToken.DICT_BEGIN, EToken.OBJECT_BEGIN)

    # Array for token storage
    tokens = []

    # Parse a character stream into a token stream
    with open(filepath, 'rb') as f:
        tk = TokenStream(filepath, f)
        indent = 0
        while True:
            t = tk.next_token()
            if t.type == EToken.EOF:
                break
            # Dedent before printing so a closer lines up with its opener
            if t.type in closers:
                indent -= 1
            t.print_indented(indent)
            # Indent after printing so only the opener's contents move right
            if t.type in openers:
                indent += 1
            tokens.append(t)

    # Hand the collected tokens back instead of silently discarding them
    return tokens
#-------------------------------------------------------------------------------
# parse_objects
#-------------------------------------------------------------------------------
def parse_objects(filepath):
    """Print every object read from the file at *filepath*.

    The file is opened in binary mode and handed to an ObjectStream. Objects
    are pulled one at a time and the result of each object's show() is
    printed. Reading stops at the first object whose type is EObject.EOF;
    that object is not printed.

    Args:
        filepath: path of the file to parse.
    """
    def objects_until_eof(stream):
        # Yield objects from the stream one by one, stopping at the EOF marker
        while True:
            candidate = stream.next_object()
            if candidate.type == EObject.EOF:
                return
            yield candidate

    # Parse a character stream into an object stream
    with open(filepath, 'rb') as pdf_file:
        for obj in objects_until_eof(ObjectStream(filepath, pdf_file)):
            print(obj.show())
#-------------------------------------------------------------------------------
# main
#-------------------------------------------------------------------------------
if __name__ == '__main__':
    # Check cmd line args: the first argument is the path of the file to
    # parse; any further arguments are ignored.
    if len(sys.argv) < 2:
        # The usage text is a diagnostic, so send it to stderr rather than
        # mixing it into the regular output on stdout.
        print(f'usage: {sys.argv[0]} <filepath>', file=sys.stderr)
        # Use sys.exit(): the bare exit() helper is injected by the `site`
        # module for interactive use and is not guaranteed to exist.
        sys.exit(-1)
    filepath = sys.argv[1]

    # WARNING: you cannot read a pdf file by looping over the objects from the
    # beginning. Example: in CNIL-PIA-3-BonnesPratiques.pdf, there is a stream
    # object with number 6086, where the Length value is given as an indirect
    # object reference 6099 0 R, and the definition for that object comes later
    # in the file.

    # Swap the comments on the two calls below to dump raw tokens instead of
    # parsed objects.
    # parse_tokens(filepath)
    parse_objects(filepath)