-
Notifications
You must be signed in to change notification settings - Fork 47
/
ve.utils.parsing.js
282 lines (263 loc) · 9.84 KB
/
ve.utils.parsing.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
/*!
* VisualEditor parsing utilities, used when converting HTMLDocuments and strings.
*
* @copyright See AUTHORS.txt
*/
/**
 * Determine whether an element is one of the block element types.
 *
 * The element name is lowercased before it is looked up in `ve.elementTypes.block`,
 * so the check is case-insensitive.
 *
 * @param {HTMLElement|string} element DOM element, or an element name
 * @return {boolean} The element's name is listed as a block element
 */
ve.isBlockElement = function ( element ) {
	let name;
	if ( typeof element === 'string' ) {
		name = element;
	} else {
		name = element.nodeName;
	}
	const position = ve.elementTypes.block.indexOf( name.toLowerCase() );
	return position !== -1;
};
/**
 * Determine whether an element is a void element, i.e. one that can't have children.
 *
 * The element name is lowercased before it is looked up in `ve.elementTypes.void`,
 * so the check is case-insensitive.
 *
 * @param {HTMLElement|string} element DOM element, or an element name
 * @return {boolean} The element's name is listed as a void element
 */
ve.isVoidElement = function ( element ) {
	let name;
	if ( typeof element === 'string' ) {
		name = element;
	} else {
		name = element.nodeName;
	}
	const position = ve.elementTypes.void.indexOf( name.toLowerCase() );
	return position !== -1;
};
/**
 * Lists of element names, grouped by element type.
 *
 * All names are lowercase; ve.isBlockElement and ve.isVoidElement lowercase the
 * name they are given before looking it up in `block` and `void` respectively.
 * The lists are not mutually exclusive: 'col', 'embed' and 'hr' appear in both.
 */
ve.elementTypes = {
// Names treated as block elements by ve.isBlockElement
block: [
'div', 'p',
// Tables
'table', 'tbody', 'thead', 'tfoot', 'caption', 'th', 'tr', 'td',
// Lists
'ul', 'ol', 'li', 'dl', 'dt', 'dd',
// HTML5 heading content
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hgroup',
// HTML5 sectioning content
'article', 'aside', 'body', 'nav', 'section', 'footer', 'header', 'figure',
'figcaption', 'fieldset', 'details', 'blockquote',
// Other
'hr', 'button', 'canvas', 'center', 'col', 'colgroup', 'embed',
'map', 'object', 'pre', 'progress', 'video'
],
// Names treated as void elements (can't have children) by ve.isVoidElement
void: [
// https://html.spec.whatwg.org/#void-elements
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
'link', 'meta', 'param', 'source', 'track', 'wbr'
]
};
/**
 * Match the opening tag of an element that appears once, e.g. 'html' or 'body'.
 *
 * Both `<tag>` and `<tag …attributes…>` are matched. The tag name must be followed
 * directly by `>` or whitespace, so a longer name with the same prefix (e.g. `<header>`
 * when looking for 'head') is not matched.
 *
 * HTML tag names are ASCII case-insensitive, so the match ignores case: `<BODY>` is
 * found when searching for 'body'. Without this, documents using uppercase tags were
 * treated as having no such tag at all.
 *
 * The tag name is inserted into the regular expression as-is, so it is expected to
 * be a plain tag name.
 *
 * @param {string} html Document HTML
 * @param {string} tag Tag name
 * @return {string[]|null} Regex match (the whole opening tag is at index 0), null if not found
 */
ve.matchTag = function ( html, tag ) {
	return html.match(
		// 'i' flag: tag names are case-insensitive in HTML
		new RegExp( '<' + tag + '(>|\\s[^>]*>)', 'i' )
	);
};
/**
 * Insert a tag into the `<head>` of a document by splicing HTML strings.
 *
 * The tag is placed directly after the opening `<head>` tag. If there is no `<head>`
 * tag, a new `<head>` containing the tag is placed directly after the opening `<html>`
 * tag, or at the very start of the string when there is no `<html>` tag either.
 *
 * @param {string} docHtml Document HTML
 * @param {string} tagHtml Tag HTML to be added to `<head>`
 * @return {string} Document HTML
 */
ve.addHeadTag = function ( docHtml, tagHtml ) {
	/**
	 * Splice text into docHtml right after the end of a regex match
	 *
	 * @param {string[]} match Regex match
	 * @param {string} text Text to insert
	 * @return {string}
	 */
	const spliceAfter = ( match, text ) => {
		const end = match.index + match[ 0 ].length;
		return docHtml.slice( 0, end ) + text + docHtml.slice( end );
	};

	const headMatch = ve.matchTag( docHtml, 'head' );
	if ( headMatch ) {
		return spliceAfter( headMatch, tagHtml );
	}

	const wrappedTag = '<head>' + tagHtml + '</head>';
	const htmlMatch = ve.matchTag( docHtml, 'html' );
	if ( htmlMatch ) {
		// <html> but no <head>
		return spliceAfter( htmlMatch, wrappedTag );
	}

	// Neither <html> nor <head>
	return wrappedTag + docHtml;
};
/**
 * Parse an HTML string into a new HTMLDocument.
 *
 * The html parameter is supposed to be a full HTML document with a doctype and an `<html>` tag.
 * If no `<body>` tag is found in it (e.g. a document fragment was passed), the whole string
 * is wrapped in `<body>…</body>` before parsing.
 *
 * To create an empty document, pass the empty string; it is parsed as-is.
 *
 * If your input is both valid HTML and valid XML, and you need to work around style
 * normalization bugs in Internet Explorer, use #parseXhtml and #serializeXhtml.
 *
 * @param {string} html
 * @return {HTMLDocument} Document constructed from the HTML string
 */
ve.createDocumentFromHtml = function ( html ) {
	let markup = html;

	if ( markup !== '' ) {
		const bodyMatch = ve.matchTag( markup, 'body' );
		if ( !bodyMatch ) {
			// A fragment that starts with a <meta> or <style> element would otherwise have
			// that element placed in the automatically generated <head> rather than <body>,
			// which breaks our assumptions. (T273234)
			markup = '<body>' + markup + '</body>';
		}
		// iOS hack (T116525): temporary meta tag, marked so it can be found again after parsing
		markup = ve.addHeadTag( markup, '<meta name="format-detection" content="telephone=no" data-ve-tmp/>' );
	}

	const parsedDoc = new DOMParser().parseFromString( markup, 'text/html' );

	// Take the temporary iOS hack tag back out of the parsed document
	const marker = parsedDoc.querySelector( 'meta[data-ve-tmp]' );
	if ( marker ) {
		marker.parentNode.removeChild( marker );
	}

	return parsedDoc;
};
/**
 * Make the base URL of a target document absolute, when it may currently be relative.
 * The target document's base URL is resolved against the base URL of the source document.
 *
 * Note that the fallbackBase parameter will be used if there is no <base> tag, even if
 * the document does have a valid base URL: this is to work around Firefox's behavior of having
 * documents created by DOMParser inherit the base URL of the main document.
 *
 * @param {HTMLDocument} targetDoc Document whose base URL should be resolved
 * @param {HTMLDocument} sourceDoc Document whose base URL should be used for resolution
 * @param {string} [fallbackBase] Base URL to use if resolving the base URL fails or there is no <base> tag
 */
ve.fixBase = function ( targetDoc, sourceDoc, fallbackBase ) {
	const existingBase = targetDoc.getElementsByTagName( 'base' )[ 0 ];

	if ( !existingBase ) {
		// Support: Firefox
		// No <base> tag: add one pointing at the fallback, if a fallback was given
		if ( fallbackBase ) {
			const newBase = targetDoc.createElement( 'base' );
			newBase.setAttribute( 'href', fallbackBase );
			targetDoc.head.appendChild( newBase );
		}
		return;
	}

	// Support: Safari
	// In Safari a base node with an invalid href (e.g. protocol-relative)
	// in a document which has been dynamically created results in
	// 'about:blank' rather than '' or null. The base's href will also be '',
	// but that works out just setting the base to fallbackBase, so it's okay.
	const hasUsableBaseUri = targetDoc.baseURI && targetDoc.baseURI !== 'about:blank';
	if ( !hasUsableBaseUri ) {
		// <base> tag present but not valid, try resolving its URL against the source document
		const resolvedHref = ve.resolveUrl( existingBase.getAttribute( 'href' ), sourceDoc );
		existingBase.setAttribute( 'href', resolvedHref );
		if ( !targetDoc.baseURI && fallbackBase ) {
			// Resolving didn't produce a base URL either, use the fallback
			existingBase.setAttribute( 'href', fallbackBase );
		}
	}

	// Support: Chrome
	// Chrome just entirely ignores <base> tags with a protocol-relative href attribute.
	// Code below is *not a no-op*; reading the href property and setting it back
	// will expand the href *attribute* to use an absolute URL if it was relative.
	// eslint-disable-next-line no-self-assign
	existingBase.href = existingBase.href;
};
/**
 * Get the actual inner HTML of a DOM node.
 *
 * In most browsers, .innerHTML is broken and eats newlines in `<pre>` elements, see
 * https://bugzilla.mozilla.org/show_bug.cgi?id=838954 . The element is passed through
 * ve.fixupPreBug first, which detects this behavior and works around it to the extent
 * possible. `<pre>\nFoo</pre>` will become `<pre>Foo</pre>` if the browser is broken,
 * but newlines are preserved in all other cases.
 *
 * @param {HTMLElement} element HTML element to get inner HTML of
 * @return {string} Inner HTML
 */
ve.properInnerHtml = function ( element ) {
	const fixedElement = ve.fixupPreBug( element );
	return fixedElement.innerHTML;
};
/**
 * Get the actual outer HTML of a DOM node.
 *
 * The element is passed through ve.fixupPreBug before its outerHTML is read.
 *
 * @see ve.properInnerHtml
 * @param {HTMLElement} element HTML element to get outer HTML of
 * @return {string} Outer HTML
 */
ve.properOuterHtml = function ( element ) {
	const fixedElement = ve.fixupPreBug( element );
	return fixedElement.outerHTML;
};
/**
 * Helper function for #properInnerHtml, #properOuterHtml and #serializeXhtml.
 *
 * Detect whether the browser has broken `<pre>` serialization, and if so return a clone
 * of the node with extra newlines added to make it serialize properly. If the browser is not
 * broken, just return the original node.
 *
 * The detection runs once; its result is cached in `ve.isPreInnerHtmlBroken`.
 *
 * @param {HTMLElement} element HTML element to fix up
 * @return {HTMLElement} Either element, or a fixed-up clone of it
 */
ve.fixupPreBug = function ( element ) {
	// Support: Chrome, FF
	if ( ve.isPreInnerHtmlBroken === undefined ) {
		// Round-trip a `<pre>` holding two newlines and see whether one of them is lost
		const probe = document.createElement( 'div' );
		probe.innerHTML = '<pre>\n\n</pre>';
		ve.isPreInnerHtmlBroken = probe.innerHTML === '<pre>\n</pre>';
	}

	if ( !ve.isPreInnerHtmlBroken ) {
		return element;
	}

	// Workaround for T44469: if a `<pre>` starts with a newline, that means .innerHTML will
	// screw up and stringify it with one fewer newline. Work around this by adding a newline.
	// If we don't see a leading newline, we still don't know if the original HTML was
	// `<pre>Foo</pre>` or `<pre>\nFoo</pre>`, but that's a syntactic difference, not a
	// semantic one, and handling that is the integration target's job.
	const $clone = $( element ).clone();
	$clone.find( 'pre, textarea, listing' ).each( ( index, node ) => {
		const firstChild = node.firstChild;
		if ( !firstChild || firstChild.nodeType !== Node.TEXT_NODE ) {
			// Nothing to fix unless the content starts with a text node
			return;
		}
		const leadingNewline = firstChild.data.match( /^(\r\n|\r|\n)/ );
		if ( leadingNewline && leadingNewline[ 1 ] ) {
			// Prepend a newline exactly like the one we saw
			firstChild.insertData( 0, leadingNewline[ 1 ] );
		}
	} );
	return $clone.get( 0 );
};
/**
 * Helper function for #transformStyleAttributes.
 *
 * Normalize an attribute value by setting it on a freshly created element and reading
 * it back. In compliant browsers, this should be a no-op, but in IE style attributes are
 * normalized on all elements, color and bgcolor attributes are normalized on some elements
 * (like `<tr>`), and width and height attributes are normalized on some elements
 * (like `<table>`).
 *
 * @param {string} name Attribute name
 * @param {string} value Attribute value
 * @param {string} [nodeName='div'] Element name
 * @return {string} Normalized attribute value
 */
ve.normalizeAttributeValue = function ( name, value, nodeName ) {
	const tagName = nodeName || 'div';
	const scratchNode = document.createElement( tagName );
	scratchNode.setAttribute( name, value );
	return scratchNode.getAttribute( name );
};
/**
 * Resolve a URL relative to a given base.
 *
 * The URL is set as the href attribute of an `<a>` element created in the base document,
 * and the element's href property is then read back.
 *
 * @param {string} url URL to resolve
 * @param {HTMLDocument} base Document whose base URL to use
 * @return {string} Resolved URL, or the original URL if the resolved one is empty
 */
ve.resolveUrl = function ( url, base ) {
	const anchor = base.createElement( 'a' );
	anchor.setAttribute( 'href', url );
	const resolved = anchor.href;
	if ( resolved ) {
		return resolved;
	}
	// If doc.baseURI isn't set, the href property will be an empty string;
	// handing back the original URL is more useful than that
	return url;
};