// html-parser.js (7.9 KB)
  1. /**
  2. * Not type-checking this file because it's mostly vendor code.
  3. */
  4. /*!
  5. * HTML Parser By John Resig (ejohn.org)
  6. * Modified by Juriy "kangax" Zaytsev
  7. * Original code by Erik Arvidsson, Mozilla Public License
  8. * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
  9. */
  10. import { decodeHTML } from 'entities'
  11. import { makeMap, no } from 'shared/util'
  12. import { isNonPhrasingTag, canBeLeftOpenTag } from 'web/util/index'
  13. // Regular Expressions for parsing tags and attributes
  14. const singleAttrIdentifier = /([^\s"'<>\/=]+)/
  15. const singleAttrAssign = /=/
  16. const singleAttrAssigns = [singleAttrAssign]
  17. const singleAttrValues = [
  18. // attr value double quotes
  19. /"([^"]*)"+/.source,
  20. // attr value, single quotes
  21. /'([^']*)'+/.source,
  22. // attr value, no quotes
  23. /([^\s"'=<>`]+)/.source
  24. ]
  25. // could use https://www.w3.org/TR/1999/REC-xml-names-19990114/#NT-QName
  26. // but for Vue templates we can enforce a simple charset
  27. const ncname = '[a-zA-Z_][\\w\\-\\.]*'
  28. const qnameCapture = '((?:' + ncname + '\\:)?' + ncname + ')'
  29. const startTagOpen = new RegExp('^<' + qnameCapture)
  30. const startTagClose = /^\s*(\/?)>/
  31. const endTag = new RegExp('^<\\/' + qnameCapture + '[^>]*>')
  32. const doctype = /^<!DOCTYPE [^>]+>/i
  33. let IS_REGEX_CAPTURING_BROKEN = false
  34. 'x'.replace(/x(.)?/g, function (m, g) {
  35. IS_REGEX_CAPTURING_BROKEN = g === ''
  36. })
  37. // Special Elements (can contain anything)
  38. const special = makeMap('script,style', true)
  39. const reCache = {}
  40. function attrForHandler (handler) {
  41. const pattern = singleAttrIdentifier.source +
  42. '(?:\\s*(' + joinSingleAttrAssigns(handler) + ')' +
  43. '\\s*(?:' + singleAttrValues.join('|') + '))?'
  44. return new RegExp('^\\s*' + pattern)
  45. }
  46. function joinSingleAttrAssigns (handler) {
  47. return singleAttrAssigns.map(function (assign) {
  48. return '(?:' + assign.source + ')'
  49. }).join('|')
  50. }
  51. export function parseHTML (html, handler) {
  52. const stack = []
  53. const attribute = attrForHandler(handler)
  54. const expectHTML = handler.expectHTML
  55. const isUnaryTag = handler.isUnaryTag || no
  56. const isSpecialTag = handler.isSpecialTag || special
  57. let last, prevTag, nextTag, lastTag
  58. while (html) {
  59. last = html
  60. // Make sure we're not in a script or style element
  61. if (!lastTag || !isSpecialTag(lastTag)) {
  62. const textEnd = html.indexOf('<')
  63. if (textEnd === 0) {
  64. // Comment:
  65. if (/^<!--/.test(html)) {
  66. const commentEnd = html.indexOf('-->')
  67. if (commentEnd >= 0) {
  68. html = html.substring(commentEnd + 3)
  69. prevTag = ''
  70. continue
  71. }
  72. }
  73. // http://en.wikipedia.org/wiki/Conditional_comment#Downlevel-revealed_conditional_comment
  74. if (/^<!\[/.test(html)) {
  75. const conditionalEnd = html.indexOf(']>')
  76. if (conditionalEnd >= 0) {
  77. html = html.substring(conditionalEnd + 2)
  78. prevTag = ''
  79. continue
  80. }
  81. }
  82. // Doctype:
  83. const doctypeMatch = html.match(doctype)
  84. if (doctypeMatch) {
  85. if (handler.doctype) {
  86. handler.doctype(doctypeMatch[0])
  87. }
  88. html = html.substring(doctypeMatch[0].length)
  89. prevTag = ''
  90. continue
  91. }
  92. // End tag:
  93. const endTagMatch = html.match(endTag)
  94. if (endTagMatch) {
  95. html = html.substring(endTagMatch[0].length)
  96. endTagMatch[0].replace(endTag, parseEndTag)
  97. prevTag = '/' + endTagMatch[1].toLowerCase()
  98. continue
  99. }
  100. // Start tag:
  101. const startTagMatch = parseStartTag(html)
  102. if (startTagMatch) {
  103. html = startTagMatch.rest
  104. handleStartTag(startTagMatch)
  105. prevTag = startTagMatch.tagName.toLowerCase()
  106. continue
  107. }
  108. }
  109. let text
  110. if (textEnd >= 0) {
  111. text = html.substring(0, textEnd)
  112. html = html.substring(textEnd)
  113. } else {
  114. text = html
  115. html = ''
  116. }
  117. // next tag
  118. let nextTagMatch = parseStartTag(html)
  119. if (nextTagMatch) {
  120. nextTag = nextTagMatch.tagName
  121. } else {
  122. nextTagMatch = html.match(endTag)
  123. if (nextTagMatch) {
  124. nextTag = '/' + nextTagMatch[1]
  125. } else {
  126. nextTag = ''
  127. }
  128. }
  129. if (handler.chars) {
  130. handler.chars(text, prevTag, nextTag)
  131. }
  132. prevTag = ''
  133. } else {
  134. const stackedTag = lastTag.toLowerCase()
  135. const reStackedTag = reCache[stackedTag] || (reCache[stackedTag] = new RegExp('([\\s\\S]*?)</' + stackedTag + '[^>]*>', 'i'))
  136. html = html.replace(reStackedTag, function (all, text) {
  137. if (stackedTag !== 'script' && stackedTag !== 'style' && stackedTag !== 'noscript') {
  138. text = text
  139. .replace(/<!--([\s\S]*?)-->/g, '$1')
  140. .replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, '$1')
  141. }
  142. if (handler.chars) {
  143. handler.chars(text)
  144. }
  145. return ''
  146. })
  147. parseEndTag('</' + stackedTag + '>', stackedTag)
  148. }
  149. if (html === last) {
  150. throw new Error('Error parsing template:\n\n' + html)
  151. }
  152. }
  153. if (!handler.partialMarkup) {
  154. // Clean up any remaining tags
  155. parseEndTag()
  156. }
  157. function parseStartTag (input) {
  158. const start = input.match(startTagOpen)
  159. if (start) {
  160. const match = {
  161. tagName: start[1],
  162. attrs: []
  163. }
  164. input = input.slice(start[0].length)
  165. let end, attr
  166. while (!(end = input.match(startTagClose)) && (attr = input.match(attribute))) {
  167. input = input.slice(attr[0].length)
  168. match.attrs.push(attr)
  169. }
  170. if (end) {
  171. match.unarySlash = end[1]
  172. match.rest = input.slice(end[0].length)
  173. return match
  174. }
  175. }
  176. }
  177. function handleStartTag (match) {
  178. const tagName = match.tagName
  179. let unarySlash = match.unarySlash
  180. if (expectHTML) {
  181. if (lastTag === 'p' && isNonPhrasingTag(tagName)) {
  182. parseEndTag('', lastTag)
  183. }
  184. if (canBeLeftOpenTag(tagName) && lastTag === tagName) {
  185. parseEndTag('', tagName)
  186. }
  187. }
  188. const unary = isUnaryTag(tagName) || tagName === 'html' && lastTag === 'head' || !!unarySlash
  189. const l = match.attrs.length
  190. const attrs = new Array(l)
  191. for (let i = 0; i < l; i++) {
  192. const args = match.attrs[i]
  193. // hackish work around FF bug https://bugzilla.mozilla.org/show_bug.cgi?id=369778
  194. if (IS_REGEX_CAPTURING_BROKEN && args[0].indexOf('""') === -1) {
  195. if (args[3] === '') { delete args[3] }
  196. if (args[4] === '') { delete args[4] }
  197. if (args[5] === '') { delete args[5] }
  198. }
  199. attrs[i] = {
  200. name: args[1],
  201. value: decodeHTML(args[3] || args[4] || args[5] || '')
  202. }
  203. }
  204. if (!unary) {
  205. stack.push({ tag: tagName, attrs: attrs })
  206. lastTag = tagName
  207. unarySlash = ''
  208. }
  209. if (handler.start) {
  210. handler.start(tagName, attrs, unary, unarySlash)
  211. }
  212. }
  213. function parseEndTag (tag, tagName) {
  214. let pos
  215. // Find the closest opened tag of the same type
  216. if (tagName) {
  217. const needle = tagName.toLowerCase()
  218. for (pos = stack.length - 1; pos >= 0; pos--) {
  219. if (stack[pos].tag.toLowerCase() === needle) {
  220. break
  221. }
  222. }
  223. } else {
  224. // If no tag name is provided, clean shop
  225. pos = 0
  226. }
  227. if (pos >= 0) {
  228. // Close all the open elements, up the stack
  229. for (let i = stack.length - 1; i >= pos; i--) {
  230. if (handler.end) {
  231. handler.end(stack[i].tag, stack[i].attrs, i > pos || !tag)
  232. }
  233. }
  234. // Remove the open elements from the stack
  235. stack.length = pos
  236. lastTag = pos && stack[pos - 1].tag
  237. } else if (tagName.toLowerCase() === 'br') {
  238. if (handler.start) {
  239. handler.start(tagName, [], true, '')
  240. }
  241. } else if (tagName.toLowerCase() === 'p') {
  242. if (handler.start) {
  243. handler.start(tagName, [], false, '', true)
  244. }
  245. if (handler.end) {
  246. handler.end(tagName, [])
  247. }
  248. }
  249. }
  250. }