-
Notifications
You must be signed in to change notification settings - Fork 13
/
features.py
137 lines (108 loc) · 5.16 KB
/
features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python
"""Provides access to all Features of a Document.

Defines the Features class, which groups the parts of a document into
feature categories (images, macros, comments, ...) by matching each part's
Content-Type and inbound Relationship types.
"""
__author__ = 'Brandon Gordon'
__email__ = '[email protected]'
class Features(object):
    """
    Provide access to all :class:`Features` of a :class:`~officedissector.doc.Document`.

    Features in an OOXML document are identified in two ways:

    1. By their Content-Type
    2. By their inbound Relationship types

    For completeness, we use either means of identification,
    allowing either one to identify a feature.

    For example:

    >>> part = doc.part_by_name['/word/media/image1.jpeg']
    >>> part.content_type()
    'image/jpeg'
    >>> part.relationships_in()[0].type
    'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'

    Content-Types and Relationships are mainly referenced from the OOXML
    specifications at `ISO/IEC:29500-1`_ 15.2 and `ISO/IEC:29500-2`_ 13.2.

    .. _`ISO/IEC:29500-2`:
        http://standards.iso.org/ittf/PubliclyAvailableStandards/c061796_ISO_IEC_29500-2_2012_Electronic_inserts.zip
    .. _`ISO/IEC:29500-1`:
        http://standards.iso.org/ittf/PubliclyAvailableStandards/c061750_ISO_IEC_29500-1_2012.zip

    Every attribute below is a list of parts, without duplicates, built once
    when the object is constructed.

    :ivar doc: the document these features were collected from
    :ivar custom_properties: custom document property parts
    :ivar images: image parts
    :ivar videos: video parts
    :ivar sounds: audio parts
    :ivar fonts: embedded (including obfuscated) font parts
    :ivar macros: VBA project and international macro sheet parts
    :ivar comments: comment parts (word processing, spreadsheet, presentation)
    :ivar custom_xml: custom XML parts (identified by Relationship only)
    :ivar embedded_controls: ActiveX control parts
    :ivar embedded_objects: OLE object parts (identified by Relationship only)
    :ivar embedded_packages: embedded package parts (identified by Relationship only)
    :ivar digital_signatures: digital signature origin, signature and certificate parts
    """

    def __init__(self, doc):
        """
        Initialize the Features object.

        :param doc: the :class:`~officedissector.doc.Document` associated with this object
        :type doc: :class:`~officedissector.doc.Document`
        """
        self.doc = doc

        # Schema: self._get_parts([content_type1,
        #                          content_type2...],
        #                         [relationship1,
        #                          relationship2...])
        #
        # Content types are handed to doc.parts_by_content_type_regex(), so
        # they are written as regular expressions: the '+' of a '+xml' suffix
        # is escaped, otherwise it would act as a quantifier and the pattern
        # would never match the literal content type.
        self.custom_properties = self._get_parts(
            [r'application/vnd.openxmlformats-officedocument.custom-properties\+xml'],
            ['custom-properties'])

        self.images = self._get_parts(['image/'],
                                      ['relationships/image'])

        self.videos = self._get_parts(['video/'],
                                      ['relationships/video'])

        self.sounds = self._get_parts(['audio/'],
                                      ['relationships/audio'])

        self.fonts = self._get_parts(
            ['application/x-font',
             'application/vnd.openxmlformats-officedocument.obfuscatedFont'],
            ['relationships/font'])

        self.macros = self._get_parts(
            ['application/vnd.ms-office.vbaProject',
             r'application/vnd.ms-excel.intlmacrosheet\+xml'],
            ['relationships/xlIntlMacrosheet',
             'relationships/vbaProject'])

        self.comments = self._get_parts(
            [r'application/vnd.openxmlformats-officedocument.wordprocessingml.comments\+xml',
             r'application/vnd.openxmlformats-officedocument.spreadsheetml.comments\+xml',
             r'application/vnd.openxmlformats-officedocument.presentationml.comments\+xml'],
            ['relationships/comments'])

        # Note that Custom XML is identified only by Relationship
        self.custom_xml = self._get_parts([],
                                          ['relationships/customXml'])

        self.embedded_controls = self._get_parts(
            [r'application/vnd.ms-office.activeX\+xml'],
            ['relationships/control'])

        # Note that embedded objects is identified only by Relationship
        self.embedded_objects = self._get_parts([],
                                                ['relationships/oleObject'])

        # Note that embedded packages is identified only by Relationship
        self.embedded_packages = self._get_parts([],
                                                 ['relationships/package'])

        # Identify and provide access to digital signature parts.
        # Content types follow the Open Packaging Conventions naming
        # ("digital-signature-<kind>").
        self.digital_signatures = self._get_parts(
            ['application/vnd.openxmlformats-package.digital-signature-certificate',
             'application/vnd.openxmlformats-package.digital-signature-origin',
             r'application/vnd.openxmlformats-package.digital-signature-xmlsignature\+xml'],
            ['relationships/digitalsignature/signature',
             'relationships/digitalsignature/certificate',
             'relationships/digitalsignature/origin'])

    def _get_parts(self, content_types, rels):
        """
        Return all Parts that match any given content type or relationship.

        :param content_types: patterns passed one at a time to
            ``self.doc.parts_by_content_type_regex``
        :param rels: relationship types passed one at a time to
            ``self.doc.parts_by_relationship_type``
        :return: list of matching parts without duplicates, in the order
            they were first found (content-type matches first, then
            relationship matches)
        """
        candidates = []
        for ct in content_types:
            candidates.extend(self.doc.parts_by_content_type_regex(ct))
        for rel in rels:
            candidates.extend(self.doc.parts_by_relationship_type(rel))

        # A part can be found by several patterns; keep only its first
        # occurrence so the result is duplicate-free and deterministic.
        seen = set()
        parts = []
        for part in candidates:
            if part not in seen:
                seen.add(part)
                parts.append(part)
        return parts

    def __repr__(self):
        """Return a short description naming the associated document."""
        return "Features of: %s" % self.doc