1+ /*!
2+ * HTML Parser By John Resig (ejohn.org)
3+ * Modified by Juriy "kangax" Zaytsev, Evan You and Vue.js community
4+ * Original code by Erik Arvidsson, Mozilla Public License
5+ * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
6+ */
7+
8+ import { makeMap , no } from './utils'
9+
// Lookup for tags that are not phrasing content.
// HTML5 tags: https://html.spec.whatwg.org/multipage/indices.html#elements-3
// Phrasing content: https://html.spec.whatwg.org/multipage/dom.html#phrasing-content
const isNonPhrasingTag = makeMap([
  'address', 'article', 'aside', 'base', 'blockquote', 'body', 'caption', 'col', 'colgroup', 'dd',
  'details', 'dialog', 'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
  'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'hr', 'html', 'legend', 'li', 'menuitem', 'meta',
  'optgroup', 'option', 'param', 'rp', 'rt', 'source', 'style', 'summary', 'tbody', 'td', 'tfoot', 'th', 'thead',
  'title', 'tr', 'track'
].join(','))
19+
// Regular expressions for parsing tags and attributes.
//
// `attribute` capture groups:
//   [1] attribute name
//   [2] the "=" sign (absent for value-less attributes)
//   [3] double-quoted value, [4] single-quoted value, [5] unquoted value
const attribute = /^\s*([^\s"'<>\/=]+)(?:\s*(=)\s*(?:"([^"]*)"+|'([^']*)'+|([^\s"'=<>`]+)))?/
// could use https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-QName
// but for Vue templates we can enforce a simple charset
const ncname = '[a-zA-Z_][\\w\\-\\.]*'
// A tag name with an optional "prefix:" part, captured as one group.
// No whitespace may appear inside these template strings: it would become
// part of the compiled pattern.
const qnameCapture = `((?:${ncname}\\:)?${ncname})`
// "<tag" at the start of the input; [1] is the tag name.
const startTagOpen = new RegExp(`^<${qnameCapture}`)
// The end of a start tag, ">" or "/>"; [1] is "/" for a self-closing tag.
const startTagClose = /^\s*(\/?)>/
// "</tag ...>" at the start of the input; [1] is the tag name.
const endTag = new RegExp(`^<\\/${qnameCapture}[^>]*>`)
const doctype = /^<!DOCTYPE [^>]+>/i
// #7298: escape - to avoid being parsed as HTML comment when inlined in page
const comment = /^<!\--/
const conditionalComment = /^<!\[/
33+
// Feature detection: is an optional capture group that did not participate
// in the match handed to a replace() callback as '' instead of undefined?
// (see https://bugzilla.mozilla.org/show_bug.cgi?id=369778)
// The probe pattern must match the literal 'x' with the optional group left
// empty; any stray whitespace inside it would stop the callback from running.
let IS_REGEX_CAPTURING_BROKEN = false
'x'.replace(/x(.)?/g, function (m, g) {
  IS_REGEX_CAPTURING_BROKEN = g === ''
})
38+
// Special elements (can contain anything): their content is handled as text.
export const isPlainTextElement = makeMap('script,style,textarea', true)
// Cache of compiled closing-tag regular expressions, keyed by tag name.
const reCache = {}
42+
// Maps each encoded entity (exactly as matched by the regular expressions
// below) to the character it stands for. The keys must be the encoded
// entity text itself, because lookups use the matched text as the key.
const decodingMap = {
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&amp;': '&',
  '&#10;': '\n',
  '&#9;': '\t'
}
// Entities that are always decoded in attribute values.
const encodedAttr = /&(?:lt|gt|quot|amp);/g
// Same set, plus the encoded newline (&#10;) and tab (&#9;).
const encodedAttrWithNewLines = /&(?:lt|gt|quot|amp|#10|#9);/g
53+
// #5992
// Tags for which a newline directly following the start tag is skipped.
const isIgnoreNewlineTag = makeMap('pre,textarea', true)

// Truthy only when `tag` is one of the tags above and `html` begins with "\n".
const shouldIgnoreFirstNewline = (tag, html) => {
  if (!tag) {
    return tag
  }
  const isListed = isIgnoreNewlineTag(tag)
  if (!isListed) {
    return isListed
  }
  return html[0] === '\n'
}
57+
/**
 * Replace encoded entities in an attribute value with their characters.
 *
 * @param {string} value - raw attribute value
 * @param {*} shouldDecodeNewlines - when truthy, the pattern that also
 *   covers encoded newlines and tabs is used
 * @returns {string} the decoded value
 */
function decodeAttr (value, shouldDecodeNewlines) {
  let pattern = encodedAttr
  if (shouldDecodeNewlines) {
    pattern = encodedAttrWithNewLines
  }
  return value.replace(pattern, function (entity) {
    return decodingMap[entity]
  })
}
62+
/**
 * Walk an HTML string from left to right and report what is found through
 * the callbacks supplied on `options`.
 *
 * Options read by this function:
 *  - expectHTML: enables the implicit closing of <p> and of left-open tags
 *  - isUnaryTag(tagName), canBeLeftOpenTag(tagName): tag predicates
 *    (both default to `no`)
 *  - shouldKeepComment + comment(text): comment reporting
 *  - shouldDecodeNewlines, shouldDecodeNewlinesForHref: passed to decodeAttr
 *  - start(tagName, attrs, unary, start, end): called for each start tag
 *  - end(tagName, start, end): called for each closed element
 *  - chars(text): called for each run of text
 *  - warn(message): called for malformed input outside production builds
 *
 * @param {string} html - the markup to parse
 * @param {Object} options - callbacks and flags, see above
 */
export function parseHTML (html, options) {
  // Currently open elements, innermost last.
  const stack = []
  const expectHTML = options.expectHTML
  const isUnaryTag = options.isUnaryTag || no
  const canBeLeftOpenTag = options.canBeLeftOpenTag || no
  // Offset of the start of `html` (the unparsed remainder) in the original input.
  let index = 0
  let last, lastTag
  while (html) {
    last = html
    // Make sure we're not in a plaintext content element like script/style
    if (!lastTag || !isPlainTextElement(lastTag)) {
      let textEnd = html.indexOf('<')
      if (textEnd === 0) {
        // Comment:
        if (comment.test(html)) {
          const commentEnd = html.indexOf('-->')

          if (commentEnd >= 0) {
            // Only report the comment when a handler was actually supplied.
            if (options.shouldKeepComment && options.comment) {
              options.comment(html.substring(4, commentEnd))
            }
            advance(commentEnd + 3)
            continue
          }
        }

        // http://en.wikipedia.org/wiki/Conditional_comment#Downlevel-revealed_conditional_comment
        if (conditionalComment.test(html)) {
          const conditionalEnd = html.indexOf(']>')

          if (conditionalEnd >= 0) {
            advance(conditionalEnd + 2)
            continue
          }
        }

        // Doctype:
        const doctypeMatch = html.match(doctype)
        if (doctypeMatch) {
          advance(doctypeMatch[0].length)
          continue
        }

        // End tag:
        const endTagMatch = html.match(endTag)
        if (endTagMatch) {
          const curIndex = index
          advance(endTagMatch[0].length)
          parseEndTag(endTagMatch[1], curIndex, index)
          continue
        }

        // Start tag:
        const startTagMatch = parseStartTag()
        if (startTagMatch) {
          handleStartTag(startTagMatch)
          if (shouldIgnoreFirstNewline(lastTag, html)) {
            advance(1)
          }
          continue
        }
      }

      let text, rest, next
      if (textEnd >= 0) {
        rest = html.slice(textEnd)
        while (
          !endTag.test(rest) &&
          !startTagOpen.test(rest) &&
          !comment.test(rest) &&
          !conditionalComment.test(rest)
        ) {
          // < in plain text, be forgiving and treat it as text
          next = rest.indexOf('<', 1)
          if (next < 0) break
          textEnd += next
          rest = html.slice(textEnd)
        }
        text = html.substring(0, textEnd)
        advance(textEnd)
      }

      if (textEnd < 0) {
        // No '<' left: everything that remains is text. Consume it through
        // advance() so `index` stays in step with the input and the final
        // clean-up reports correct positions.
        text = html
        advance(html.length)
      }

      if (options.chars && text) {
        options.chars(text)
      }
    } else {
      // Inside a plain-text element: take everything up to its closing tag.
      let endTagLength = 0
      const stackedTag = lastTag.toLowerCase()
      const reStackedTag = reCache[stackedTag] || (reCache[stackedTag] = new RegExp('([\\s\\S]*?)(</' + stackedTag + '[^>]*>)', 'i'))
      const rest = html.replace(reStackedTag, function (all, text, endTag) {
        endTagLength = endTag.length
        // NOTE(review): this branch looks unreachable given the enclosing
        // condition — confirm against makeMap's case handling.
        if (!isPlainTextElement(stackedTag) && stackedTag !== 'noscript') {
          text = text
            .replace(/<!\--([\s\S]*?)-->/g, '$1') // #7298
            .replace(/<!\[CDATA\[([\s\S]*?)]]>/g, '$1')
        }
        if (shouldIgnoreFirstNewline(stackedTag, text)) {
          text = text.slice(1)
        }
        if (options.chars) {
          options.chars(text)
        }
        return ''
      })
      index += html.length - rest.length
      html = rest
      parseEndTag(stackedTag, index - endTagLength, index)
    }

    // Nothing was consumed in this iteration: hand the remainder over as
    // text and stop, otherwise the loop would never terminate.
    if (html === last) {
      options.chars && options.chars(html)
      if (process.env.NODE_ENV !== 'production' && !stack.length && options.warn) {
        options.warn(`Mal-formatted tag at end of template: "${html}"`)
      }
      break
    }
  }

  // Clean up any remaining tags
  parseEndTag()

  // Drop the first `n` characters of the remaining input.
  function advance (n) {
    index += n
    html = html.substring(n)
  }

  // Try to consume a complete start tag, attributes included, from the head
  // of `html`. Returns a match record, or undefined when the input does not
  // begin with a start tag or the tag is never closed.
  function parseStartTag () {
    const start = html.match(startTagOpen)
    if (start) {
      const match = {
        tagName: start[1],
        attrs: [],
        start: index
      }
      advance(start[0].length)
      let end, attr
      while (!(end = html.match(startTagClose)) && (attr = html.match(attribute))) {
        advance(attr[0].length)
        match.attrs.push(attr)
      }
      if (end) {
        match.unarySlash = end[1]
        advance(end[0].length)
        match.end = index
        return match
      }
    }
  }

  // Turn a match record from parseStartTag into an options.start call and,
  // for non-unary tags, push the element on the stack.
  function handleStartTag (match) {
    const tagName = match.tagName
    const unarySlash = match.unarySlash

    if (expectHTML) {
      // A non-phrasing tag implicitly closes an open <p>.
      if (lastTag === 'p' && isNonPhrasingTag(tagName)) {
        parseEndTag(lastTag)
      }
      // A repeated left-open tag implicitly closes the previous one.
      if (canBeLeftOpenTag(tagName) && lastTag === tagName) {
        parseEndTag(tagName)
      }
    }

    const unary = isUnaryTag(tagName) || !!unarySlash

    const l = match.attrs.length
    const attrs = new Array(l)
    for (let i = 0; i < l; i++) {
      const args = match.attrs[i]
      // hackish work around FF bug https://bugzilla.mozilla.org/show_bug.cgi?id=369778
      if (IS_REGEX_CAPTURING_BROKEN && args[0].indexOf('""') === -1) {
        if (args[3] === '') { delete args[3] }
        if (args[4] === '') { delete args[4] }
        if (args[5] === '') { delete args[5] }
      }
      // The value sits in whichever quoting alternative matched.
      const value = args[3] || args[4] || args[5] || ''
      const shouldDecodeNewlines = tagName === 'a' && args[1] === 'href'
        ? options.shouldDecodeNewlinesForHref
        : options.shouldDecodeNewlines
      attrs[i] = {
        name: args[1],
        value: decodeAttr(value, shouldDecodeNewlines)
      }
    }

    if (!unary) {
      stack.push({ tag: tagName, lowerCasedTag: tagName.toLowerCase(), attrs: attrs })
      lastTag = tagName
    }

    if (options.start) {
      options.start(tagName, attrs, unary, match.start, match.end)
    }
  }

  // Close the nearest open element named `tagName` together with everything
  // opened after it. Called without a tag name, it closes every element
  // still open. `start` and `end` default to the current index.
  function parseEndTag (tagName, start, end) {
    let pos, lowerCasedTagName
    if (start == null) start = index
    if (end == null) end = index

    if (tagName) {
      lowerCasedTagName = tagName.toLowerCase()
    }

    // Find the closest opened tag of the same type
    if (tagName) {
      for (pos = stack.length - 1; pos >= 0; pos--) {
        if (stack[pos].lowerCasedTag === lowerCasedTagName) {
          break
        }
      }
    } else {
      // If no tag name is provided, clean shop
      pos = 0
    }

    if (pos >= 0) {
      // Close all the open elements, up the stack
      for (let i = stack.length - 1; i >= pos; i--) {
        if (process.env.NODE_ENV !== 'production' &&
          (i > pos || !tagName) &&
          options.warn
        ) {
          options.warn(
            `tag <${stack[i].tag}> has no matching end tag.`
          )
        }
        if (options.end) {
          options.end(stack[i].tag, start, end)
        }
      }

      // Remove the open elements from the stack
      stack.length = pos
      lastTag = pos && stack[pos - 1].tag
    } else if (lowerCasedTagName === 'br') {
      // A stray </br> is reported as a unary start tag.
      if (options.start) {
        options.start(tagName, [], true, start, end)
      }
    } else if (lowerCasedTagName === 'p') {
      // A stray </p> is reported as an empty element (start then end).
      if (options.start) {
        options.start(tagName, [], false, start, end)
      }
      if (options.end) {
        options.end(tagName, start, end)
      }
    }
  }
}
0 commit comments