html-parser.js 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299
  1. /**
  2. * Not type-checking this file because it's mostly vendor code.
  3. */
  4. /*!
  5. * HTML Parser By John Resig (ejohn.org)
  6. * Modified by Juriy "kangax" Zaytsev
  7. * Original code by Erik Arvidsson, Mozilla Public License
  8. * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
  9. */
  10. import { makeMap, no } from 'shared/util'
  11. import { isNonPhrasingTag } from 'web/compiler/util'
  // Regular Expressions for parsing tags and attributes

  // Matches a single attribute at the start of the input. Capture groups:
  //   [1] attribute name
  //   [2] the "=" sign, when the attribute has a value
  //   [3] value written in double quotes
  //   [4] value written in single quotes
  //   [5] unquoted value
  // Only one of [3]/[4]/[5] is set for a given match.
  const attribute = /^\s*([^\s"'<>\/=]+)(?:\s*(=)\s*(?:"([^"]*)"+|'([^']*)'+|([^\s"'=<>`]+)))?/
  // could use https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-QName
  // but for Vue templates we can enforce a simple charset
  const ncname = '[a-zA-Z_][\\w\\-\\.]*'
  // A tag name with an optional "prefix:" part, wrapped in one capture group.
  const qnameCapture = `((?:${ncname}\\:)?${ncname})`
  // "<" followed by a tag name; group [1] is the tag name.
  const startTagOpen = new RegExp(`^<${qnameCapture}`)
  // End of a start tag: optional whitespace, optional "/" (group [1]), then ">".
  const startTagClose = /^\s*(\/?)>/
  // A complete end tag; group [1] is the tag name. Anything between the name
  // and the closing ">" is tolerated.
  const endTag = new RegExp(`^<\\/${qnameCapture}[^>]*>`)
  // A doctype declaration (case-insensitive).
  const doctype = /^<!DOCTYPE [^>]+>/i
  // #7298: escape - to avoid being parsed as HTML comment when inlined in page
  const comment = /^<!\--/
  // Opening of a conditional comment ("<![").
  const conditionalComment = /^<!\[/
  // Special Elements (can contain anything)
  // NOTE(review): makeMap is a project helper; presumably it returns a
  // membership test for the comma-separated names — confirm in shared/util.
  export const isPlainTextElement = makeMap('script,style,textarea', true)
  // Cache of the end-tag regexes built for plain-text elements, keyed by the
  // lower-cased tag name.
  const reCache = {}
  // Maps an encoded character reference to the character it stands for.
  // Every alternative in the two patterns below needs an entry here,
  // otherwise the replacement callback would yield `undefined`.
  const decodingMap = {
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&amp;': '&',
    '&#10;': '\n',
    '&#9;': '\t',
    '&#39;': "'"
  }
  // References decoded in every attribute value. `&#39;` (apostrophe) is
  // included so that an encoded single quote is decoded like the other quotes.
  const encodedAttr = /&(?:lt|gt|quot|amp|#39);/g
  // Same set plus the encoded line feed (&#10;) and tab (&#9;); used only when
  // the caller asks for newlines to be decoded.
  const encodedAttrWithNewLines = /&(?:lt|gt|quot|amp|#39|#10|#9);/g
  // #5992
  // Tags for which a newline directly after the opening tag is dropped.
  const isIgnoreNewlineTag = makeMap('pre,textarea', true)
  /**
   * Tells whether the first character of `html` is a newline that should be
   * skipped because it directly follows the opening of `tag`.
   *
   * Returns the first falsy operand (`tag` itself, or the result of
   * `isIgnoreNewlineTag(tag)`) when one of the preconditions fails, otherwise
   * the boolean result of the newline comparison.
   */
  const shouldIgnoreFirstNewline = (tag, html) => {
    if (!tag) {
      return tag
    }
    const ignoresNewline = isIgnoreNewlineTag(tag)
    if (!ignoresNewline) {
      return ignoresNewline
    }
    return html[0] === '\n'
  }
  /**
   * Replaces the encoded character references found in an attribute value
   * with the characters listed for them in `decodingMap`.
   *
   * @param {string} value - raw attribute value as captured by the parser.
   * @param {*} shouldDecodeNewlines - when truthy, the pattern that also
   *   covers encoded line feeds and tabs is used.
   * @returns {string} the decoded value.
   */
  function decodeAttr (value, shouldDecodeNewlines) {
    let entityPattern = encodedAttr
    if (shouldDecodeNewlines) {
      entityPattern = encodedAttrWithNewLines
    }
    return value.replace(entityPattern, function (entity) {
      return decodingMap[entity]
    })
  }
  /**
   * Walks an HTML string from left to right and reports what it finds through
   * the callbacks supplied on `options`. Nothing is returned.
   *
   * @param {string} html - markup to tokenize.
   * @param {Object} options
   * @param {*} [options.expectHTML] - when truthy, an open `<p>` is closed
   *   before a tag for which `isNonPhrasingTag` is truthy, and an open tag is
   *   closed when the same tag is opened again and `canBeLeftOpenTag` accepts it.
   * @param {Function} [options.isUnaryTag] - `(tagName) => truthy` for tags
   *   that never have a closing tag; falls back to the imported `no` helper.
   * @param {Function} [options.canBeLeftOpenTag] - `(tagName) => truthy` for
   *   tags that may be left unclosed; falls back to the imported `no` helper.
   * @param {*} [options.shouldKeepComment] - when truthy, `options.comment`
   *   is called for every complete comment.
   * @param {*} [options.shouldDecodeNewlines] - forwarded to `decodeAttr` for
   *   attribute values.
   * @param {*} [options.shouldDecodeNewlinesForHref] - used instead of
   *   `shouldDecodeNewlines` for the `href` attribute of `a` tags.
   * @param {Function} [options.start] - `(tagName, attrs, unary, start, end)`;
   *   `attrs` is an array of `{ name, value }`.
   * @param {Function} [options.end] - `(tagName, start, end)`.
   * @param {Function} [options.chars] - `(text)`.
   * @param {Function} [options.comment] - `(text)`, the text between `<!--`
   *   and `-->`.
   * @param {Function} [options.warn] - `(message)`; only called when
   *   `process.env.NODE_ENV` is not `'production'`.
   */
  export function parseHTML (html, options) {
    // Currently open elements, innermost last: { tag, lowerCasedTag, attrs }.
    const stack = []
    const expectHTML = options.expectHTML
    const isUnaryTag = options.isUnaryTag || no
    const canBeLeftOpenTag = options.canBeLeftOpenTag || no
    // Offset of the unparsed remainder (`html`) within the original input.
    let index = 0
    // `last` is the remainder at the start of the current iteration and is
    // used to detect an iteration that consumed nothing; `lastTag` is the name
    // of the innermost open element.
    let last, lastTag
    while (html) {
      last = html
      // Make sure we're not in a plaintext content element like script/style
      if (!lastTag || !isPlainTextElement(lastTag)) {
        let textEnd = html.indexOf('<')
        if (textEnd === 0) {
          // Comment:
          if (comment.test(html)) {
            const commentEnd = html.indexOf('-->')

            // An unterminated comment is not consumed here; it falls through
            // to the checks below.
            if (commentEnd >= 0) {
              if (options.shouldKeepComment) {
                // 4 === '<!--'.length
                options.comment(html.substring(4, commentEnd))
              }
              advance(commentEnd + 3)
              continue
            }
          }

          // http://en.wikipedia.org/wiki/Conditional_comment#Downlevel-revealed_conditional_comment
          if (conditionalComment.test(html)) {
            const conditionalEnd = html.indexOf(']>')

            if (conditionalEnd >= 0) {
              // Conditional comments are skipped without any callback.
              advance(conditionalEnd + 2)
              continue
            }
          }

          // Doctype:
          const doctypeMatch = html.match(doctype)
          if (doctypeMatch) {
            advance(doctypeMatch[0].length)
            continue
          }

          // End tag:
          const endTagMatch = html.match(endTag)
          if (endTagMatch) {
            const curIndex = index
            advance(endTagMatch[0].length)
            parseEndTag(endTagMatch[1], curIndex, index)
            continue
          }

          // Start tag:
          const startTagMatch = parseStartTag()
          if (startTagMatch) {
            handleStartTag(startTagMatch)
            // Test the tag that was just opened, not `lastTag`: for a unary
            // tag `lastTag` still names the enclosing element, so a newline
            // after e.g. `<br>` inside `<pre>` would be dropped by mistake.
            if (shouldIgnoreFirstNewline(startTagMatch.tagName, html)) {
              advance(1)
            }
            continue
          }
        }

        let text, rest, next
        if (textEnd >= 0) {
          rest = html.slice(textEnd)
          // Extend the text run past every "<" that does not begin an end
          // tag, a start tag, a comment or a conditional comment.
          while (
            !endTag.test(rest) &&
            !startTagOpen.test(rest) &&
            !comment.test(rest) &&
            !conditionalComment.test(rest)
          ) {
            // < in plain text, be forgiving and treat it as text
            next = rest.indexOf('<', 1)
            if (next < 0) break
            textEnd += next
            rest = html.slice(textEnd)
          }
          text = html.substring(0, textEnd)
          advance(textEnd)
        }

        // No "<" at all: the whole remainder is text.
        if (textEnd < 0) {
          text = html
          html = ''
        }

        if (options.chars && text) {
          options.chars(text)
        }
      } else {
        // Inside a plain-text element: everything up to its closing tag is
        // reported as text in one go.
        let endTagLength = 0
        const stackedTag = lastTag.toLowerCase()
        // Lazily captures the content ([1]) up to the first closing tag ([2])
        // of the current element; compiled once per tag name.
        const reStackedTag = reCache[stackedTag] || (reCache[stackedTag] = new RegExp('([\\s\\S]*?)(</' + stackedTag + '[^>]*>)', 'i'))
        // The pattern has no `g` flag, so only the first match (content plus
        // closing tag) is removed from `html`.
        const rest = html.replace(reStackedTag, function (all, text, closingTag) {
          endTagLength = closingTag.length
          if (!isPlainTextElement(stackedTag) && stackedTag !== 'noscript') {
            text = text
              .replace(/<!\--([\s\S]*?)-->/g, '$1') // #7298
              .replace(/<!\[CDATA\[([\s\S]*?)]]>/g, '$1')
          }
          if (shouldIgnoreFirstNewline(stackedTag, text)) {
            text = text.slice(1)
          }
          if (options.chars) {
            options.chars(text)
          }
          return ''
        })
        index += html.length - rest.length
        html = rest
        parseEndTag(stackedTag, index - endTagLength, index)
      }

      // Nothing was consumed in this iteration: report the remainder as text
      // and stop instead of looping forever.
      if (html === last) {
        options.chars && options.chars(html)
        if (process.env.NODE_ENV !== 'production' && !stack.length && options.warn) {
          options.warn(`Mal-formatted tag at end of template: "${html}"`)
        }
        break
      }
    }

    // Clean up any remaining tags
    parseEndTag()

    /**
     * Drops the first `n` characters of the remaining input and moves `index`
     * forward by the same amount.
     */
    function advance (n) {
      index += n
      html = html.substring(n)
    }

    /**
     * Tries to read a complete start tag from the front of the remaining input.
     *
     * Returns `{ tagName, attrs, start, unarySlash, end }` where `attrs` holds
     * the raw matches of the `attribute` pattern, or `undefined` when the input
     * does not begin with a start tag or the tag is never closed. In the
     * unclosed case the characters read so far have already been consumed.
     */
    function parseStartTag () {
      const start = html.match(startTagOpen)
      if (start) {
        const match = {
          tagName: start[1],
          attrs: [],
          start: index
        }
        advance(start[0].length)
        let end, attr
        // Collect attributes until the end of the tag is reached or nothing
        // more matches.
        while (!(end = html.match(startTagClose)) && (attr = html.match(attribute))) {
          advance(attr[0].length)
          match.attrs.push(attr)
        }
        if (end) {
          // '/' for a self-closing tag, '' otherwise.
          match.unarySlash = end[1]
          advance(end[0].length)
          match.end = index
          return match
        }
      }
    }

    /**
     * Turns the raw result of `parseStartTag` into decoded `{ name, value }`
     * attributes, pushes non-unary tags on the stack and calls `options.start`.
     */
    function handleStartTag (match) {
      const tagName = match.tagName
      const unarySlash = match.unarySlash

      if (expectHTML) {
        // Implicitly close an open <p> before a non-phrasing tag.
        if (lastTag === 'p' && isNonPhrasingTag(tagName)) {
          parseEndTag(lastTag)
        }
        // Implicitly close a tag that may be left open when it is opened again.
        if (canBeLeftOpenTag(tagName) && lastTag === tagName) {
          parseEndTag(tagName)
        }
      }

      const unary = isUnaryTag(tagName) || !!unarySlash

      const l = match.attrs.length
      const attrs = new Array(l)
      for (let i = 0; i < l; i++) {
        const args = match.attrs[i]
        // Groups 3/4/5 are the double-quoted, single-quoted and unquoted value.
        const value = args[3] || args[4] || args[5] || ''
        const shouldDecodeNewlines = tagName === 'a' && args[1] === 'href'
          ? options.shouldDecodeNewlinesForHref
          : options.shouldDecodeNewlines
        attrs[i] = {
          name: args[1],
          value: decodeAttr(value, shouldDecodeNewlines)
        }
      }

      if (!unary) {
        stack.push({ tag: tagName, lowerCasedTag: tagName.toLowerCase(), attrs: attrs })
        lastTag = tagName
      }

      if (options.start) {
        options.start(tagName, attrs, unary, match.start, match.end)
      }
    }

    /**
     * Closes the nearest open element named `tagName` together with everything
     * opened after it, calling `options.end` for each. Called without a tag
     * name it closes every element still open. `start` and `end` default to the
     * current `index`.
     */
    function parseEndTag (tagName, start, end) {
      let pos, lowerCasedTagName
      if (start == null) start = index
      if (end == null) end = index

      if (tagName) {
        lowerCasedTagName = tagName.toLowerCase()
      }

      // Find the closest opened tag of the same type
      if (tagName) {
        for (pos = stack.length - 1; pos >= 0; pos--) {
          if (stack[pos].lowerCasedTag === lowerCasedTagName) {
            break
          }
        }
      } else {
        // If no tag name is provided, clean shop
        pos = 0
      }

      if (pos >= 0) {
        // Close all the open elements, up the stack
        for (let i = stack.length - 1; i >= pos; i--) {
          // Every element closed here other than the one explicitly named
          // was never closed in the source.
          if (process.env.NODE_ENV !== 'production' &&
            (i > pos || !tagName) &&
            options.warn
          ) {
            options.warn(
              `tag <${stack[i].tag}> has no matching end tag.`
            )
          }
          if (options.end) {
            options.end(stack[i].tag, start, end)
          }
        }

        // Remove the open elements from the stack
        stack.length = pos
        // When the stack is now empty this leaves the falsy `pos` (0) in lastTag.
        lastTag = pos && stack[pos - 1].tag
      } else if (lowerCasedTagName === 'br') {
        // A stray </br> with no open <br> is reported as a unary start tag.
        if (options.start) {
          options.start(tagName, [], true, start, end)
        }
      } else if (lowerCasedTagName === 'p') {
        // A stray </p> with no open <p> is reported as an empty element.
        if (options.start) {
          options.start(tagName, [], false, start, end)
        }
        if (options.end) {
          options.end(tagName, start, end)
        }
      }
    }
  }