html-parser.js 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274
  1. /**
  2. * Not type-checking this file because it's mostly vendor code.
  3. */
  4. /*!
  5. * HTML Parser By John Resig (ejohn.org)
  6. * Modified by Juriy "kangax" Zaytsev
  7. * Original code by Erik Arvidsson, Mozilla Public License
  8. * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
  9. */
  10. import { makeMap, no } from 'shared/util'
  11. import { isNonPhrasingTag, canBeLeftOpenTag } from 'web/util/index'
  12. // Regular Expressions for parsing tags and attributes
  13. const singleAttrIdentifier = /([^\s"'<>\/=]+)/
  14. const singleAttrAssign = /(?:=)/
  15. const singleAttrValues = [
  16. // attr value double quotes
  17. /"([^"]*)"+/.source,
  18. // attr value, single quotes
  19. /'([^']*)'+/.source,
  20. // attr value, no quotes
  21. /([^\s"'=<>`]+)/.source
  22. ]
  23. const attribute = new RegExp(
  24. '^\\s*' + singleAttrIdentifier.source +
  25. '(?:\\s*(' + singleAttrAssign.source + ')' +
  26. '\\s*(?:' + singleAttrValues.join('|') + '))?'
  27. )
  28. // could use https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-QName
  29. // but for Vue templates we can enforce a simple charset
  30. const ncname = '[a-zA-Z_][\\w\\-\\.]*'
  31. const qnameCapture = '((?:' + ncname + '\\:)?' + ncname + ')'
  32. const startTagOpen = new RegExp('^<' + qnameCapture)
  33. const startTagClose = /^\s*(\/?)>/
  34. const endTag = new RegExp('^<\\/' + qnameCapture + '[^>]*>')
  35. const doctype = /^<!DOCTYPE [^>]+>/i
  36. let IS_REGEX_CAPTURING_BROKEN = false
  37. 'x'.replace(/x(.)?/g, function (m, g) {
  38. IS_REGEX_CAPTURING_BROKEN = g === ''
  39. })
// Special Elements (can contain anything)
// Predicate that parseHTML applies to the innermost open tag; when it returns
// truthy the content is consumed verbatim up to the matching closing tag.
// NOTE(review): makeMap comes from shared/util; presumably the `true` flag
// makes the lookup case-insensitive — confirm against that helper.
const isSpecialTag = makeMap('script,style', true)
// Cache of closing-tag matchers keyed by lower-cased tag name, filled lazily
// by parseHTML so each RegExp is only constructed once.
const reCache = {}
  43. const ampRE = /&amp;/g
  44. const ltRE = /&lt;/g
  45. const gtRE = /&gt;/g
  46. function decodeAttr (value, shouldDecodeTags) {
  47. if (shouldDecodeTags) {
  48. value = value.replace(ltRE, '<').replace(gtRE, '>')
  49. }
  50. return value.replace(ampRE, '&')
  51. }
  52. export function parseHTML (html, options) {
  53. const stack = []
  54. const expectHTML = options.expectHTML
  55. const isUnaryTag = options.isUnaryTag || no
  56. const isFromDOM = options.isFromDOM
  57. const shouldDecodeTags = options.shouldDecodeTags
  58. let index = 0
  59. let last, lastTag
  60. while (html) {
  61. last = html
  62. // Make sure we're not in a script or style element
  63. if (!lastTag || !isSpecialTag(lastTag)) {
  64. const textEnd = html.indexOf('<')
  65. if (textEnd === 0) {
  66. // Comment:
  67. if (/^<!--/.test(html)) {
  68. const commentEnd = html.indexOf('-->')
  69. if (commentEnd >= 0) {
  70. advance(commentEnd + 3)
  71. continue
  72. }
  73. }
  74. // http://en.wikipedia.org/wiki/Conditional_comment#Downlevel-revealed_conditional_comment
  75. if (/^<!\[/.test(html)) {
  76. const conditionalEnd = html.indexOf(']>')
  77. if (conditionalEnd >= 0) {
  78. advance(conditionalEnd + 2)
  79. continue
  80. }
  81. }
  82. // Doctype:
  83. const doctypeMatch = html.match(doctype)
  84. if (doctypeMatch) {
  85. advance(doctypeMatch[0].length)
  86. continue
  87. }
  88. // End tag:
  89. const endTagMatch = html.match(endTag)
  90. if (endTagMatch) {
  91. const curIndex = index
  92. advance(endTagMatch[0].length)
  93. parseEndTag(endTagMatch[0], endTagMatch[1], curIndex, index)
  94. continue
  95. }
  96. // Start tag:
  97. const startTagMatch = parseStartTag()
  98. if (startTagMatch) {
  99. handleStartTag(startTagMatch)
  100. continue
  101. }
  102. }
  103. let text
  104. if (textEnd >= 0) {
  105. text = html.substring(0, textEnd)
  106. advance(textEnd)
  107. } else {
  108. text = html
  109. html = ''
  110. }
  111. if (options.chars) {
  112. options.chars(text)
  113. }
  114. } else {
  115. const stackedTag = lastTag.toLowerCase()
  116. const reStackedTag = reCache[stackedTag] || (reCache[stackedTag] = new RegExp('([\\s\\S]*?)(</' + stackedTag + '[^>]*>)', 'i'))
  117. let endTagLength = 0
  118. const rest = html.replace(reStackedTag, function (all, text, endTag) {
  119. endTagLength = endTag.length
  120. if (stackedTag !== 'script' && stackedTag !== 'style' && stackedTag !== 'noscript') {
  121. text = text
  122. .replace(/<!--([\s\S]*?)-->/g, '$1')
  123. .replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, '$1')
  124. }
  125. if (options.chars) {
  126. options.chars(text)
  127. }
  128. return ''
  129. })
  130. index += html.length - rest.length
  131. html = rest
  132. parseEndTag('</' + stackedTag + '>', stackedTag, index - endTagLength, index)
  133. }
  134. if (html === last) {
  135. throw new Error('Error parsing template:\n\n' + html)
  136. }
  137. }
  138. // Clean up any remaining tags
  139. parseEndTag()
  140. function advance (n) {
  141. index += n
  142. html = html.substring(n)
  143. }
  144. function parseStartTag () {
  145. const start = html.match(startTagOpen)
  146. if (start) {
  147. const match = {
  148. tagName: start[1],
  149. attrs: [],
  150. start: index
  151. }
  152. advance(start[0].length)
  153. let end, attr
  154. while (!(end = html.match(startTagClose)) && (attr = html.match(attribute))) {
  155. advance(attr[0].length)
  156. match.attrs.push(attr)
  157. }
  158. if (end) {
  159. match.unarySlash = end[1]
  160. advance(end[0].length)
  161. match.end = index
  162. return match
  163. }
  164. }
  165. }
  166. function handleStartTag (match) {
  167. const tagName = match.tagName
  168. let unarySlash = match.unarySlash
  169. if (expectHTML) {
  170. if (lastTag === 'p' && isNonPhrasingTag(tagName)) {
  171. parseEndTag('', lastTag)
  172. }
  173. if (canBeLeftOpenTag(tagName) && lastTag === tagName) {
  174. parseEndTag('', tagName)
  175. }
  176. }
  177. const unary = isUnaryTag(tagName) || tagName === 'html' && lastTag === 'head' || !!unarySlash
  178. const l = match.attrs.length
  179. const attrs = new Array(l)
  180. for (let i = 0; i < l; i++) {
  181. const args = match.attrs[i]
  182. // hackish work around FF bug https://bugzilla.mozilla.org/show_bug.cgi?id=369778
  183. if (IS_REGEX_CAPTURING_BROKEN && args[0].indexOf('""') === -1) {
  184. if (args[3] === '') { delete args[3] }
  185. if (args[4] === '') { delete args[4] }
  186. if (args[5] === '') { delete args[5] }
  187. }
  188. const value = args[3] || args[4] || args[5] || ''
  189. attrs[i] = {
  190. name: args[1],
  191. value: isFromDOM ? decodeAttr(value, shouldDecodeTags) : value
  192. }
  193. }
  194. if (!unary) {
  195. stack.push({ tag: tagName, attrs: attrs })
  196. lastTag = tagName
  197. unarySlash = ''
  198. }
  199. if (options.start) {
  200. options.start(tagName, attrs, unary, match.start, match.end)
  201. }
  202. }
  203. function parseEndTag (tag, tagName, start, end) {
  204. let pos
  205. if (start == null) start = index
  206. if (end == null) end = index
  207. // Find the closest opened tag of the same type
  208. if (tagName) {
  209. const needle = tagName.toLowerCase()
  210. for (pos = stack.length - 1; pos >= 0; pos--) {
  211. if (stack[pos].tag.toLowerCase() === needle) {
  212. break
  213. }
  214. }
  215. } else {
  216. // If no tag name is provided, clean shop
  217. pos = 0
  218. }
  219. if (pos >= 0) {
  220. // Close all the open elements, up the stack
  221. for (let i = stack.length - 1; i >= pos; i--) {
  222. if (options.end) {
  223. options.end(stack[i].tag, start, end)
  224. }
  225. }
  226. // Remove the open elements from the stack
  227. stack.length = pos
  228. lastTag = pos && stack[pos - 1].tag
  229. } else if (tagName.toLowerCase() === 'br') {
  230. if (options.start) {
  231. options.start(tagName, [], true, start, end)
  232. }
  233. } else if (tagName.toLowerCase() === 'p') {
  234. if (options.start) {
  235. options.start(tagName, [], false, start, end)
  236. }
  237. if (options.end) {
  238. options.end(tagName, start, end)
  239. }
  240. }
  241. }
  242. }