html-parser.ts 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. /**
  2. * Not type-checking this file because it's mostly vendor code.
  3. */
  4. /*!
  5. * HTML Parser By John Resig (ejohn.org)
  6. * Modified by Juriy "kangax" Zaytsev
  7. * Original code by Erik Arvidsson (MPL-1.1 OR Apache-2.0 OR GPL-2.0-or-later)
  8. * http://erik.eae.net/simplehtmlparser/simplehtmlparser.js
  9. */
  10. import { makeMap, no } from 'shared/util'
  11. import { isNonPhrasingTag } from 'web/compiler/util'
  12. import { unicodeRegExp } from 'core/util/lang'
  13. import { ASTAttr, CompilerOptions } from 'types/compiler'
  14. // Regular Expressions for parsing tags and attributes
  15. const attribute =
  16. /^\s*([^\s"'<>\/=]+)(?:\s*(=)\s*(?:"([^"]*)"+|'([^']*)'+|([^\s"'=<>`]+)))?/
  17. const dynamicArgAttribute =
  18. /^\s*((?:v-[\w-]+:|@|:|#)\[[^=]+?\][^\s"'<>\/=]*)(?:\s*(=)\s*(?:"([^"]*)"+|'([^']*)'+|([^\s"'=<>`]+)))?/
  19. const ncname = `[a-zA-Z_][\\-\\.0-9_a-zA-Z${unicodeRegExp.source}]*`
  20. const qnameCapture = `((?:${ncname}\\:)?${ncname})`
  21. const startTagOpen = new RegExp(`^<${qnameCapture}`)
  22. const startTagClose = /^\s*(\/?)>/
  23. const endTag = new RegExp(`^<\\/${qnameCapture}[^>]*>`)
  24. const doctype = /^<!DOCTYPE [^>]+>/i
  25. // #7298: escape - to avoid being passed as HTML comment when inlined in page
  26. const comment = /^<!\--/
  27. const conditionalComment = /^<!\[/
  28. // Special Elements (can contain anything)
  29. export const isPlainTextElement = makeMap('script,style,textarea', true)
  30. const reCache = {}
  31. const decodingMap = {
  32. '&lt;': '<',
  33. '&gt;': '>',
  34. '&quot;': '"',
  35. '&amp;': '&',
  36. '&#10;': '\n',
  37. '&#9;': '\t',
  38. '&#39;': "'"
  39. }
  40. const encodedAttr = /&(?:lt|gt|quot|amp|#39);/g
  41. const encodedAttrWithNewLines = /&(?:lt|gt|quot|amp|#39|#10|#9);/g
  42. // #5992
  43. const isIgnoreNewlineTag = makeMap('pre,textarea', true)
  44. const shouldIgnoreFirstNewline = (tag, html) =>
  45. tag && isIgnoreNewlineTag(tag) && html[0] === '\n'
  46. function decodeAttr(value, shouldDecodeNewlines) {
  47. const re = shouldDecodeNewlines ? encodedAttrWithNewLines : encodedAttr
  48. return value.replace(re, match => decodingMap[match])
  49. }
  50. export interface HTMLParserOptions extends CompilerOptions {
  51. start?: (
  52. tag: string,
  53. attrs: ASTAttr[],
  54. unary: boolean,
  55. start: number,
  56. end: number
  57. ) => void
  58. end?: (tag: string, start: number, end: number) => void
  59. chars?: (text: string, start?: number, end?: number) => void
  60. comment?: (content: string, start: number, end: number) => void
  61. }
  62. export function parseHTML(html, options: HTMLParserOptions) {
  63. const stack: any[] = []
  64. const expectHTML = options.expectHTML
  65. const isUnaryTag = options.isUnaryTag || no
  66. const canBeLeftOpenTag = options.canBeLeftOpenTag || no
  67. let index = 0
  68. let last, lastTag
  69. while (html) {
  70. last = html
  71. // Make sure we're not in a plaintext content element like script/style
  72. if (!lastTag || !isPlainTextElement(lastTag)) {
  73. let textEnd = html.indexOf('<')
  74. if (textEnd === 0) {
  75. // Comment:
  76. if (comment.test(html)) {
  77. const commentEnd = html.indexOf('-->')
  78. if (commentEnd >= 0) {
  79. if (options.shouldKeepComment && options.comment) {
  80. options.comment(
  81. html.substring(4, commentEnd),
  82. index,
  83. index + commentEnd + 3
  84. )
  85. }
  86. advance(commentEnd + 3)
  87. continue
  88. }
  89. }
  90. // https://en.wikipedia.org/wiki/Conditional_comment#Downlevel-revealed_conditional_comment
  91. if (conditionalComment.test(html)) {
  92. const conditionalEnd = html.indexOf(']>')
  93. if (conditionalEnd >= 0) {
  94. advance(conditionalEnd + 2)
  95. continue
  96. }
  97. }
  98. // Doctype:
  99. const doctypeMatch = html.match(doctype)
  100. if (doctypeMatch) {
  101. advance(doctypeMatch[0].length)
  102. continue
  103. }
  104. // End tag:
  105. const endTagMatch = html.match(endTag)
  106. if (endTagMatch) {
  107. const curIndex = index
  108. advance(endTagMatch[0].length)
  109. parseEndTag(endTagMatch[1], curIndex, index)
  110. continue
  111. }
  112. // Start tag:
  113. const startTagMatch = parseStartTag()
  114. if (startTagMatch) {
  115. handleStartTag(startTagMatch)
  116. if (shouldIgnoreFirstNewline(startTagMatch.tagName, html)) {
  117. advance(1)
  118. }
  119. continue
  120. }
  121. }
  122. let text, rest, next
  123. if (textEnd >= 0) {
  124. rest = html.slice(textEnd)
  125. while (
  126. !endTag.test(rest) &&
  127. !startTagOpen.test(rest) &&
  128. !comment.test(rest) &&
  129. !conditionalComment.test(rest)
  130. ) {
  131. // < in plain text, be forgiving and treat it as text
  132. next = rest.indexOf('<', 1)
  133. if (next < 0) break
  134. textEnd += next
  135. rest = html.slice(textEnd)
  136. }
  137. text = html.substring(0, textEnd)
  138. }
  139. if (textEnd < 0) {
  140. text = html
  141. }
  142. if (text) {
  143. advance(text.length)
  144. }
  145. if (options.chars && text) {
  146. options.chars(text, index - text.length, index)
  147. }
  148. } else {
  149. let endTagLength = 0
  150. const stackedTag = lastTag.toLowerCase()
  151. const reStackedTag =
  152. reCache[stackedTag] ||
  153. (reCache[stackedTag] = new RegExp(
  154. '([\\s\\S]*?)(</' + stackedTag + '[^>]*>)',
  155. 'i'
  156. ))
  157. const rest = html.replace(reStackedTag, function (all, text, endTag) {
  158. endTagLength = endTag.length
  159. if (!isPlainTextElement(stackedTag) && stackedTag !== 'noscript') {
  160. text = text
  161. .replace(/<!\--([\s\S]*?)-->/g, '$1') // #7298
  162. .replace(/<!\[CDATA\[([\s\S]*?)]]>/g, '$1')
  163. }
  164. if (shouldIgnoreFirstNewline(stackedTag, text)) {
  165. text = text.slice(1)
  166. }
  167. if (options.chars) {
  168. options.chars(text)
  169. }
  170. return ''
  171. })
  172. index += html.length - rest.length
  173. html = rest
  174. parseEndTag(stackedTag, index - endTagLength, index)
  175. }
  176. if (html === last) {
  177. options.chars && options.chars(html)
  178. if (__DEV__ && !stack.length && options.warn) {
  179. options.warn(`Mal-formatted tag at end of template: "${html}"`, {
  180. start: index + html.length
  181. })
  182. }
  183. break
  184. }
  185. }
  186. // Clean up any remaining tags
  187. parseEndTag()
  188. function advance(n) {
  189. index += n
  190. html = html.substring(n)
  191. }
  192. function parseStartTag() {
  193. const start = html.match(startTagOpen)
  194. if (start) {
  195. const match: any = {
  196. tagName: start[1],
  197. attrs: [],
  198. start: index
  199. }
  200. advance(start[0].length)
  201. let end, attr
  202. while (
  203. !(end = html.match(startTagClose)) &&
  204. (attr = html.match(dynamicArgAttribute) || html.match(attribute))
  205. ) {
  206. attr.start = index
  207. advance(attr[0].length)
  208. attr.end = index
  209. match.attrs.push(attr)
  210. }
  211. if (end) {
  212. match.unarySlash = end[1]
  213. advance(end[0].length)
  214. match.end = index
  215. return match
  216. }
  217. }
  218. }
  219. function handleStartTag(match) {
  220. const tagName = match.tagName
  221. const unarySlash = match.unarySlash
  222. if (expectHTML) {
  223. if (lastTag === 'p' && isNonPhrasingTag(tagName)) {
  224. parseEndTag(lastTag)
  225. }
  226. if (canBeLeftOpenTag(tagName) && lastTag === tagName) {
  227. parseEndTag(tagName)
  228. }
  229. }
  230. const unary = isUnaryTag(tagName) || !!unarySlash
  231. const l = match.attrs.length
  232. const attrs: ASTAttr[] = new Array(l)
  233. for (let i = 0; i < l; i++) {
  234. const args = match.attrs[i]
  235. const value = args[3] || args[4] || args[5] || ''
  236. const shouldDecodeNewlines =
  237. tagName === 'a' && args[1] === 'href'
  238. ? options.shouldDecodeNewlinesForHref
  239. : options.shouldDecodeNewlines
  240. attrs[i] = {
  241. name: args[1],
  242. value: decodeAttr(value, shouldDecodeNewlines)
  243. }
  244. if (__DEV__ && options.outputSourceRange) {
  245. attrs[i].start = args.start + args[0].match(/^\s*/).length
  246. attrs[i].end = args.end
  247. }
  248. }
  249. if (!unary) {
  250. stack.push({
  251. tag: tagName,
  252. lowerCasedTag: tagName.toLowerCase(),
  253. attrs: attrs,
  254. start: match.start,
  255. end: match.end
  256. })
  257. lastTag = tagName
  258. }
  259. if (options.start) {
  260. options.start(tagName, attrs, unary, match.start, match.end)
  261. }
  262. }
  263. function parseEndTag(tagName?: any, start?: any, end?: any) {
  264. let pos, lowerCasedTagName
  265. if (start == null) start = index
  266. if (end == null) end = index
  267. // Find the closest opened tag of the same type
  268. if (tagName) {
  269. lowerCasedTagName = tagName.toLowerCase()
  270. for (pos = stack.length - 1; pos >= 0; pos--) {
  271. if (stack[pos].lowerCasedTag === lowerCasedTagName) {
  272. break
  273. }
  274. }
  275. } else {
  276. // If no tag name is provided, clean shop
  277. pos = 0
  278. }
  279. if (pos >= 0) {
  280. // Close all the open elements, up the stack
  281. for (let i = stack.length - 1; i >= pos; i--) {
  282. if (__DEV__ && (i > pos || !tagName) && options.warn) {
  283. options.warn(`tag <${stack[i].tag}> has no matching end tag.`, {
  284. start: stack[i].start,
  285. end: stack[i].end
  286. })
  287. }
  288. if (options.end) {
  289. options.end(stack[i].tag, start, end)
  290. }
  291. }
  292. // Remove the open elements from the stack
  293. stack.length = pos
  294. lastTag = pos && stack[pos - 1].tag
  295. } else if (lowerCasedTagName === 'br') {
  296. if (options.start) {
  297. options.start(tagName, [], true, start, end)
  298. }
  299. } else if (lowerCasedTagName === 'p') {
  300. if (options.start) {
  301. options.start(tagName, [], false, start, end)
  302. }
  303. if (options.end) {
  304. options.end(tagName, start, end)
  305. }
  306. }
  307. }
  308. }