Tokenizer.ts 34 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112
  1. /**
  2. * This Tokenizer is adapted from htmlparser2 under the MIT License listed at
  3. * https://github.com/fb55/htmlparser2/blob/master/LICENSE
  4. Copyright 2010, 2011, Chris Winberry <chris@winberry.net>. All rights reserved.
  5. Permission is hereby granted, free of charge, to any person obtaining a copy
  6. of this software and associated documentation files (the "Software"), to
  7. deal in the Software without restriction, including without limitation the
  8. rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  9. sell copies of the Software, and to permit persons to whom the Software is
  10. furnished to do so, subject to the following conditions:
  11. The above copyright notice and this permission notice shall be included in
  12. all copies or substantial portions of the Software.
  13. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  18. FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  19. IN THE SOFTWARE.
  20. */
  21. import { ElementNode, Position } from '../ast'
  22. /**
  23. * Note: entities is a non-browser-build-only dependency.
  24. * In the browser, we use an HTML element to do the decoding.
  25. * Make sure all imports from entities are only used in non-browser branches
  26. * so that it can be properly treeshaken.
  27. */
  28. import {
  29. EntityDecoder,
  30. DecodingMode,
  31. htmlDecodeTree,
  32. fromCodePoint
  33. } from 'entities/lib/decode.js'
  /** Parsing modes that decide which tags get special tokenizer treatment. */
  export const enum ParseMode {
    /** No special tags are recognized. */
    BASE = 0,
    /** `<script>`/`<style>` and `<title>`/`<textarea>` are treated specially. */
    HTML = 1,
    /** Single-file-component mode: root-level tags are handled separately. */
    SFC = 2
  }
  /** UTF-16 code units the tokenizer compares against, in ascending order. */
  export const enum CharCodes {
    // Whitespace
    Tab = 0x09, // "\t"
    NewLine = 0x0a, // "\n"
    FormFeed = 0x0c, // "\f"
    CarriageReturn = 0x0d, // "\r"
    Space = 0x20, // " "

    // Punctuation below the digits
    ExclamationMark = 0x21, // "!"
    DoubleQuote = 0x22, // '"'
    Number = 0x23, // "#"
    Amp = 0x26, // "&"
    SingleQuote = 0x27, // "'"
    Dash = 0x2d, // "-"
    Dot = 0x2e, // "."
    Slash = 0x2f, // "/"

    // Digits
    Zero = 0x30, // "0"
    Nine = 0x39, // "9"

    // Punctuation between digits and upper-case letters
    Colon = 0x3a, // ":"
    Semi = 0x3b, // ";"
    Lt = 0x3c, // "<"
    Eq = 0x3d, // "="
    Gt = 0x3e, // ">"
    Questionmark = 0x3f, // "?"
    At = 0x40, // "@"

    // Upper-case letters
    UpperA = 0x41, // "A"
    UpperF = 0x46, // "F"
    UpperZ = 0x5a, // "Z"

    // Square brackets (two names share the "[" code)
    OpeningSquareBracket = 0x5b, // "["
    LeftSqaure = 0x5b, // "["
    RightSquare = 0x5d, // "]"

    // Lower-case letters
    LowerA = 0x61, // "a"
    LowerF = 0x66, // "f"
    LowerV = 0x76, // "v"
    LowerX = 0x78, // "x"
    LowerZ = 0x7a // "z"
  }
  // Default interpolation delimiters, stored as character codes.
  const defaultDelimitersOpen = Uint8Array.of(0x7b, 0x7b) // "{{"
  const defaultDelimitersClose = Uint8Array.of(0x7d, 0x7d) // "}}"
  76. /** All the states the tokenizer can be in. */
  const enum State {
    Text = 1,

    // Interpolation
    InterpolationOpen = 2,
    Interpolation = 3,
    InterpolationClose = 4,

    // Tags
    BeforeTagName = 5, // After <
    InTagName = 6,
    InSelfClosingTag = 7,
    BeforeClosingTagName = 8,
    InClosingTagName = 9,
    AfterClosingTagName = 10,

    // Attributes and directives
    BeforeAttributeName = 11,
    InAttributeName = 12,
    InDirectiveName = 13,
    InDirectiveArg = 14,
    InDirectiveDynamicArg = 15,
    InDirectiveModifier = 16,
    AfterAttributeName = 17,
    BeforeAttributeValue = 18,
    InAttributeValueDq = 19, // "
    InAttributeValueSq = 20, // '
    InAttributeValueNq = 21,

    // Declarations
    BeforeDeclaration = 22, // !
    InDeclaration = 23,

    // Processing instructions
    InProcessingInstruction = 24, // ?

    // Comments & CDATA
    BeforeComment = 25,
    CDATASequence = 26,
    InSpecialComment = 27,
    InCommentLike = 28,

    // Special tags
    BeforeSpecialS = 29, // Decide if we deal with `<script` or `<style`
    BeforeSpecialT = 30, // Decide if we deal with `<title` or `<textarea`
    SpecialStartSequence = 31,
    InSpecialTag = 32,

    InEntity = 33,

    InSFCRootTagName = 34
  }
  120. /**
  121. * HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a
  122. * tag name.
  123. */
  function isTagStartChar(c: number): boolean {
    // Lower-case ASCII letter?
    if (c >= CharCodes.LowerA && c <= CharCodes.LowerZ) {
      return true
    }
    // Otherwise it has to be an upper-case ASCII letter.
    return c >= CharCodes.UpperA && c <= CharCodes.UpperZ
  }
  /**
   * Tells whether a character code is one of the whitespace characters the
   * tokenizer recognizes: space, line feed, tab, form feed or carriage return.
   *
   * @param c - Character code to test.
   * @returns `true` when `c` is one of those five codes.
   */
  export function isWhitespace(c: number): boolean {
    switch (c) {
      case CharCodes.Space:
      case CharCodes.NewLine:
      case CharCodes.Tab:
      case CharCodes.FormFeed:
      case CharCodes.CarriageReturn:
        return true
      default:
        return false
    }
  }
  /**
   * Tells whether a character ends the current section of a tag
   * (a tag name, attribute name, ...): `/`, `>` or whitespace.
   */
  function isEndOfTagSection(c: number): boolean {
    if (c === CharCodes.Gt || c === CharCodes.Slash) {
      return true
    }
    return isWhitespace(c)
  }
  /**
   * Converts a string into a byte array holding one entry per UTF-16 code unit.
   *
   * Because the result is a `Uint8Array`, code units above 0xFF are stored
   * modulo 256; only characters in the 0-255 range round-trip.
   *
   * @param str - The string to convert.
   * @returns A `Uint8Array` with `str.length` entries.
   */
  export function toCharCodes(str: string): Uint8Array {
    const codes = new Uint8Array(str.length)
    let i = str.length
    while (i-- > 0) {
      codes[i] = str.charCodeAt(i)
    }
    return codes
  }
  /** How an attribute value was written; passed to `Callbacks.onattribend`. */
  export enum QuoteType {
    /** The attribute has no value. */
    NoValue,
    /** The value was written without quotes. */
    Unquoted,
    /** The value was wrapped in single quotes. */
    Single,
    /** The value was wrapped in double quotes. */
    Double
  }
  /**
   * Event sink the tokenizer reports to. Positions are indices into the input
   * buffer; where both a start and an end are passed, the end is the index at
   * which the reported section stops (exclusive).
   */
  export interface Callbacks {
    /** A run of plain text. */
    ontext(start: number, endIndex: number): void
    // NOTE(review): presumably a decoded entity found in text; the call site is
    // outside the visible part of this file - confirm.
    ontextentity(char: string, start: number, endIndex: number): void
    /** An interpolation, spanning from the opening through the closing delimiter. */
    oninterpolation(start: number, endIndex: number): void
    /** The name of an opening tag. */
    onopentagname(start: number, endIndex: number): void
    /** The `>` closing an opening tag; `endIndex` is the index of that `>`. */
    onopentagend(endIndex: number): void
    /** The `>` of a self-closing tag; `endIndex` is the index of that `>`. */
    onselfclosingtag(endIndex: number): void
    /** The name of a closing tag. */
    onclosetag(start: number, endIndex: number): void
    /** A chunk of raw attribute value. */
    onattribdata(start: number, endIndex: number): void
    // NOTE(review): presumably a decoded entity found in an attribute value; the
    // call site is outside the visible part of this file - confirm.
    onattribentity(char: string, start: number, end: number): void
    /** End of an attribute, together with how its value was quoted. */
    onattribend(quote: QuoteType, endIndex: number): void
    /** A plain (non-directive) attribute name. */
    onattribname(start: number, endIndex: number): void
    /** End of any attribute or directive name. */
    onattribnameend(endIndex: number): void
    /** A directive name: either `v-...` or one of the shorthands `.`, `:`, `@`, `#`. */
    ondirname(start: number, endIndex: number): void
    /** A directive argument. */
    ondirarg(start: number, endIndex: number): void
    /** A single directive modifier (the part after a `.`). */
    ondirmodifier(start: number, endIndex: number): void
    /** The content of a comment. */
    oncomment(start: number, endIndex: number): void
    /** The content of a CDATA section. */
    oncdata(start: number, endIndex: number): void
    // onprocessinginstruction(start: number, endIndex: number): void
    // ondeclaration(start: number, endIndex: number): void
    // NOTE(review): presumably fired once the input is exhausted; the call site
    // is outside the visible part of this file - confirm.
    onend(): void
  }
  177. /**
  178. * Sequences used to match longer strings.
  179. *
  180. * We don't have `Script`, `Style`, or `Title` here. Instead, we re-use the *End
  181. * sequences with an increased offset.
  182. */
  const Sequences = {
    // `CDATA[`
    Cdata: Uint8Array.of(0x43, 0x44, 0x41, 0x54, 0x41, 0x5b),
    // `]]>`
    CdataEnd: Uint8Array.of(0x5d, 0x5d, 0x3e),
    // `-->`
    CommentEnd: Uint8Array.of(0x2d, 0x2d, 0x3e),
    // `</script`
    ScriptEnd: Uint8Array.of(0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74),
    // `</style`
    StyleEnd: Uint8Array.of(0x3c, 0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65),
    // `</title`
    TitleEnd: Uint8Array.of(0x3c, 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65),
    // `</textarea`
    TextareaEnd: Uint8Array.of(
      0x3c, 0x2f, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61
    )
  }
  194. export default class Tokenizer {
  195. /** The current state the tokenizer is in. */
  196. private state = State.Text
  197. /** The read buffer. */
  198. private buffer = ''
  199. /** The beginning of the section that is currently being read. */
  200. private sectionStart = 0
  201. /** The index within the buffer that we are currently looking at. */
  202. private index = 0
  203. /** The start of the last entity. */
  204. private entityStart = 0
  205. /** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
  206. private baseState = State.Text
  207. /** For special parsing behavior inside of script and style tags. */
  208. public inRCDATA = false
  209. /** Record newline positions for fast line / column calculation */
  210. private newlines: number[] = []
  211. private readonly entityDecoder?: EntityDecoder
  /**
   * @param stack - Element stack; its length is used to detect the root level
   *   in SFC mode.
   * @param cbs - Callbacks that receive the tokenizer events.
   */
  constructor(
    private readonly stack: ElementNode[],
    private readonly cbs: Callbacks
  ) {
    // The entity decoder only exists outside the browser build; keep every
    // use of the `entities` imports inside a `!__BROWSER__` branch.
    if (!__BROWSER__) {
      const onCodePoint = (codePoint: number, consumedLength: number) =>
        this.emitCodePoint(codePoint, consumedLength)
      this.entityDecoder = new EntityDecoder(htmlDecodeTree, onCodePoint)
    }
  }
  222. public mode = ParseMode.BASE
  /**
   * Restores the tokenizer to its initial state so the instance can be reused
   * for a new input.
   *
   * Every piece of per-parse state is cleared, including `inRCDATA`: if the
   * previous input ended inside a special tag, a stale `inRCDATA === true`
   * combined with the cleared `currentSequence` would make the next parse
   * switch to `InSpecialTag` and dereference an undefined sequence.
   */
  public reset(): void {
    // Position / section tracking
    this.state = State.Text
    this.baseState = State.Text
    this.mode = ParseMode.BASE
    this.buffer = ''
    this.sectionStart = 0
    this.index = 0
    this.entityStart = 0
    this.newlines.length = 0

    // Special-tag / sequence matching
    this.inRCDATA = false
    this.currentSequence = undefined!
    this.sequenceIndex = 0

    // Interpolation delimiters
    this.delimiterOpen = defaultDelimitersOpen
    this.delimiterClose = defaultDelimitersClose
    this.delimiterIndex = -1
  }
  235. /**
  236. * Generate Position object with line / column information using recorded
  237. * newline positions. We know the index is always going to be an already
  238. * processed index, so all the newlines up to this index should have been
  239. * recorded.
  240. */
  public getPos(index: number): Position {
    const { newlines } = this
    // Walk the recorded newlines backwards; the first one located before
    // `index` determines the line and the column.
    let i = newlines.length
    while (i-- > 0) {
      const newlineIndex = newlines[i]
      if (newlineIndex < index) {
        return {
          column: index - newlineIndex,
          line: i + 2,
          offset: index
        }
      }
    }
    // No newline before `index`: we are on the first line.
    return {
      column: index + 1,
      line: 1,
      offset: index
    }
  }
  /**
   * Text state: watches for the start of a tag (`<`), of an entity (`&`,
   * non-browser builds only) or of an interpolation.
   */
  private stateText(c: number): void {
    if (c === CharCodes.Lt) {
      // Flush the text collected so far before entering the tag.
      if (this.sectionStart < this.index) {
        this.cbs.ontext(this.sectionStart, this.index)
      }
      this.state = State.BeforeTagName
      this.sectionStart = this.index
      return
    }
    if (!__BROWSER__ && c === CharCodes.Amp) {
      this.startEntity()
      return
    }
    if (c === this.delimiterOpen[0]) {
      this.state = State.InterpolationOpen
      this.delimiterIndex = 0
      // Re-process the character as the first delimiter character.
      this.stateInterpolationOpen(c)
    }
  }
  273. public delimiterOpen: Uint8Array = defaultDelimitersOpen
  274. public delimiterClose: Uint8Array = defaultDelimitersClose
  275. private delimiterIndex = -1
  /**
   * Matches the opening interpolation delimiter one character at a time.
   * On a mismatch, the character is re-processed by the state we came from.
   */
  private stateInterpolationOpen(c: number): void {
    if (c !== this.delimiterOpen[this.delimiterIndex]) {
      // Not a delimiter after all: fall back and reconsume the character.
      if (this.inRCDATA) {
        this.state = State.InSpecialTag
        this.stateInSpecialTag(c)
      } else {
        this.state = State.Text
        this.stateText(c)
      }
      return
    }

    const lastDelimiterIndex = this.delimiterOpen.length - 1
    if (this.delimiterIndex !== lastDelimiterIndex) {
      this.delimiterIndex++
      return
    }

    // Whole delimiter matched: flush preceding text and start the
    // interpolation section at the first delimiter character.
    const delimiterStart = this.index + 1 - this.delimiterOpen.length
    if (delimiterStart > this.sectionStart) {
      this.cbs.ontext(this.sectionStart, delimiterStart)
    }
    this.state = State.Interpolation
    this.sectionStart = delimiterStart
  }
  /** Inside an interpolation: waits for the first closing-delimiter character. */
  private stateInterpolation(c: number): void {
    if (c !== this.delimiterClose[0]) {
      return
    }
    this.state = State.InterpolationClose
    this.delimiterIndex = 0
    // Re-process the character as the first closing-delimiter character.
    this.stateInterpolationClose(c)
  }
  /**
   * Matches the closing interpolation delimiter one character at a time and
   * reports the interpolation once it has been matched completely.
   */
  private stateInterpolationClose(c: number) {
    if (c !== this.delimiterClose[this.delimiterIndex]) {
      // Mismatch: still inside the interpolation, reconsume the character.
      this.state = State.Interpolation
      this.stateInterpolation(c)
      return
    }

    if (this.delimiterIndex !== this.delimiterClose.length - 1) {
      this.delimiterIndex++
      return
    }

    const sectionEnd = this.index + 1
    this.cbs.oninterpolation(this.sectionStart, sectionEnd)
    // Return to the state the interpolation was found in.
    this.state = this.inRCDATA ? State.InSpecialTag : State.Text
    this.sectionStart = this.index + 1
  }
  321. private currentSequence: Uint8Array = undefined!
  322. private sequenceIndex = 0
  /**
   * Checks whether the tag name being read really is the special tag whose
   * end sequence is stored in `currentSequence`.
   */
  private stateSpecialStartSequence(c: number): void {
    const atSequenceEnd = this.sequenceIndex === this.currentSequence.length

    let matches: boolean
    if (atSequenceEnd) {
      // The whole name matched; it only counts if the tag name ends here.
      matches = isEndOfTagSection(c)
    } else {
      // Case-insensitive comparison against the expected character.
      matches = (c | 0x20) === this.currentSequence[this.sequenceIndex]
    }

    if (matches && !atSequenceEnd) {
      this.sequenceIndex++
      return
    }
    if (!matches) {
      // Some other tag: it is not a special tag.
      this.inRCDATA = false
    }

    this.sequenceIndex = 0
    this.state = State.InTagName
    this.stateInTagName(c)
  }
  340. /** Look for an end tag. For <title> and <textarea>, also decode entities. */
  private stateInSpecialTag(c: number): void {
    const sequence = this.currentSequence

    if (this.sequenceIndex === sequence.length) {
      // The whole end sequence matched; it is a closing tag only if the
      // name ends right here.
      if (c === CharCodes.Gt || isWhitespace(c)) {
        const endOfText = this.index - sequence.length

        if (this.sectionStart < endOfText) {
          // Spoof the index so that reported locations match up.
          const actualIndex = this.index
          this.index = endOfText
          this.cbs.ontext(this.sectionStart, endOfText)
          this.index = actualIndex
        }

        this.sectionStart = endOfText + 2 // Skip over the `</`
        this.stateInClosingTagName(c)
        this.inRCDATA = false
        return // We are done; skip the rest of the function.
      }

      this.sequenceIndex = 0
    }

    // Next expected character of the end sequence (case-insensitive).
    if ((c | 0x20) === sequence[this.sequenceIndex]) {
      this.sequenceIndex += 1
      return
    }

    if (this.sequenceIndex !== 0) {
      // If we see a `<`, set the sequence index to 1; useful for eg. `<</script>`.
      this.sequenceIndex = c === CharCodes.Lt ? 1 : 0
      return
    }

    const atSfcRoot = this.mode === ParseMode.SFC && this.stack.length === 0
    const hasDecodedContent =
      sequence === Sequences.TitleEnd ||
      (sequence === Sequences.TextareaEnd && !atSfcRoot)

    if (hasDecodedContent) {
      // We have to parse entities in <title> and <textarea> tags.
      if (!__BROWSER__ && c === CharCodes.Amp) {
        this.startEntity()
      } else if (c === this.delimiterOpen[0]) {
        // We also need to handle interpolation
        this.state = State.InterpolationOpen
        this.delimiterIndex = 0
        this.stateInterpolationOpen(c)
      }
      return
    }

    // Outside of <title> and <textarea> tags, we can fast-forward.
    if (this.fastForwardTo(CharCodes.Lt)) {
      this.sequenceIndex = 1
    }
  }
  /**
   * Matches `CDATA[` after `<![`. A complete match starts a CDATA section;
   * any mismatch treats the input as a declaration instead.
   */
  private stateCDATASequence(c: number): void {
    if (c !== Sequences.Cdata[this.sequenceIndex]) {
      this.sequenceIndex = 0
      this.state = State.InDeclaration
      this.stateInDeclaration(c) // Reconsume the character
      return
    }

    this.sequenceIndex++
    if (this.sequenceIndex === Sequences.Cdata.length) {
      // Whole marker read: the CDATA content starts after this character.
      this.state = State.InCommentLike
      this.currentSequence = Sequences.CdataEnd
      this.sequenceIndex = 0
      this.sectionStart = this.index + 1
    }
  }
  399. /**
  400. * When we wait for one specific character, we can speed things up
  401. * by skipping through the buffer until we find it.
  402. *
  403. * @returns Whether the character was found.
  404. */
  private fastForwardTo(c: number): boolean {
    const end = this.buffer.length
    // Start with the character after the current one.
    for (let i = this.index + 1; i < end; i++) {
      if (this.buffer.charCodeAt(i) === c) {
        this.index = i
        return true
      }
    }

    /*
     * Not found. The `parse` loop increments the index after each state
     * call, so park it on the last character of the buffer.
     *
     * TODO: Refactor `parse` to increment index before calling states.
     */
    this.index = end - 1
    return false
  }
  420. /**
  421. * Comments and CDATA end with `-->` and `]]>`.
  422. *
  423. * Their common qualities are:
  424. * - Their end sequences have a distinct character they start with.
  425. * - That character is then repeated, so we have to check multiple repeats.
  426. * - All characters but the start character of the sequence can be skipped.
  427. */
  private stateInCommentLike(c: number): void {
    const endSequence = this.currentSequence

    if (c === endSequence[this.sequenceIndex]) {
      this.sequenceIndex++
      if (this.sequenceIndex !== endSequence.length) {
        return
      }

      // End sequence complete: report the content without the first two
      // characters of the end sequence.
      const contentEnd = this.index - 2
      if (endSequence === Sequences.CdataEnd) {
        this.cbs.oncdata(this.sectionStart, contentEnd)
      } else {
        this.cbs.oncomment(this.sectionStart, contentEnd)
      }

      this.sequenceIndex = 0
      this.sectionStart = this.index + 1
      this.state = State.Text
      return
    }

    if (this.sequenceIndex === 0) {
      // Fast-forward to the first character of the sequence
      if (this.fastForwardTo(endSequence[0])) {
        this.sequenceIndex = 1
      }
      return
    }

    // Allow long sequences, eg. --->, ]]]>
    if (c !== endSequence[this.sequenceIndex - 1]) {
      this.sequenceIndex = 0
    }
  }
  /**
   * Starts matching the name of a potential special tag.
   *
   * @param sequence - End sequence (eg. `</script`) of the candidate tag.
   * @param offset - Index into `sequence` at which matching continues.
   */
  private startSpecial(sequence: Uint8Array, offset: number) {
    this.state = State.SpecialStartSequence
    this.currentSequence = sequence
    this.sequenceIndex = offset
    this.inRCDATA = true
  }
  /** Handles the character that follows a `<`. */
  private stateBeforeTagName(c: number): void {
    if (c === CharCodes.ExclamationMark) {
      this.state = State.BeforeDeclaration
      this.sectionStart = this.index + 1
      return
    }
    if (c === CharCodes.Questionmark) {
      this.state = State.InProcessingInstruction
      this.sectionStart = this.index + 1
      return
    }
    if (c === CharCodes.Slash) {
      this.state = State.BeforeClosingTagName
      return
    }
    if (!isTagStartChar(c)) {
      // Not a tag after all: treat the `<` as text and reconsume.
      this.state = State.Text
      this.stateText(c)
      return
    }

    this.sectionStart = this.index

    if (this.mode === ParseMode.BASE) {
      // no special tags in base mode
      this.state = State.InTagName
      return
    }

    if (this.mode === ParseMode.SFC && this.stack.length === 0) {
      // SFC mode + root level
      // - everything except <template> is RAWTEXT
      // - <template> with lang other than html is also RAWTEXT
      this.state = State.InSFCRootTagName
      return
    }

    // HTML mode
    // - <script>, <style> RAWTEXT
    // - <title>, <textarea> RCDATA
    const lower = c | 0x20
    if (lower === 116 /* t */) {
      this.state = State.BeforeSpecialT
    } else if (lower === 115 /* s */) {
      this.state = State.BeforeSpecialS
    } else {
      this.state = State.InTagName
    }
  }
  /** Inside an opening tag name: waits for the character that ends it. */
  private stateInTagName(c: number): void {
    if (!isEndOfTagSection(c)) {
      return
    }
    this.handleTagName(c)
  }
  /**
   * Inside the name of a root-level tag in SFC mode. Every tag other than
   * `template` is switched to special-tag handling, using `</` + tag name as
   * its end sequence.
   */
  private stateInSFCRootTagName(c: number): void {
    if (!isEndOfTagSection(c)) {
      return
    }
    const tagName = this.buffer.slice(this.sectionStart, this.index)
    const isTemplate = tagName === 'template'
    if (!isTemplate) {
      this.inRCDATA = true
      this.currentSequence = toCharCodes(`</` + tagName)
    }
    this.handleTagName(c)
  }
  /**
   * Reports the opening tag name that ends at the current index, then lets
   * the attribute-name state process the terminating character.
   */
  private handleTagName(c: number) {
    const nameStart = this.sectionStart
    const nameEnd = this.index
    this.cbs.onopentagname(nameStart, nameEnd)
    this.sectionStart = -1
    this.state = State.BeforeAttributeName
    this.stateBeforeAttributeName(c)
  }
  /**
   * Handles the characters that follow `</`.
   *
   * - Whitespace is skipped.
   * - `>` means the end tag has no name (`</>`); the malformed tag is dropped
   *   and tokenizing continues in the text state.
   * - An ASCII letter starts a closing tag name; anything else starts a
   *   special comment.
   */
  private stateBeforeClosingTagName(c: number): void {
    if (isWhitespace(c)) {
      // Ignore
    } else if (c === CharCodes.Gt) {
      this.state = State.Text
      // Start the next section after the `>`. `sectionStart` still points at
      // the `<`, so leaving it untouched would leak `</>` into the following
      // text section.
      this.sectionStart = this.index + 1
    } else {
      this.state = isTagStartChar(c)
        ? State.InClosingTagName
        : State.InSpecialComment
      this.sectionStart = this.index
    }
  }
  /** Inside a closing tag name: reports it once `>` or whitespace is seen. */
  private stateInClosingTagName(c: number): void {
    const nameEnded = c === CharCodes.Gt || isWhitespace(c)
    if (!nameEnded) {
      return
    }
    this.cbs.onclosetag(this.sectionStart, this.index)
    this.sectionStart = -1
    this.state = State.AfterClosingTagName
    // Let the next state handle the terminating character.
    this.stateAfterClosingTagName(c)
  }
  /** After a closing tag name: skips everything up to and including `>`. */
  private stateAfterClosingTagName(c: number): void {
    const reachedEnd = c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)
    if (!reachedEnd) {
      return
    }
    this.state = State.Text
    this.sectionStart = this.index + 1
  }
  /**
   * Inside an opening tag, between attributes: handles the end of the tag,
   * the start of `/>` and the start of the next attribute.
   */
  private stateBeforeAttributeName(c: number): void {
    switch (c) {
      case CharCodes.Gt:
        this.cbs.onopentagend(this.index)
        // `inRCDATA` is read after the callback has run.
        if (this.inRCDATA) {
          this.state = State.InSpecialTag
          this.sequenceIndex = 0
        } else {
          this.state = State.Text
        }
        this.sectionStart = this.index + 1
        break
      case CharCodes.Slash:
        this.state = State.InSelfClosingTag
        break
      default:
        if (!isWhitespace(c)) {
          this.handleAttributeStart(c)
        }
    }
  }
  /**
   * Classifies the first character of an attribute: a `v-` directive, a
   * directive shorthand (`.`, `:`, `@`, `#`) or a plain attribute.
   */
  private handleAttributeStart(c: number) {
    const startsWithVDash =
      c === CharCodes.LowerV &&
      this.buffer.charCodeAt(this.index + 1) === CharCodes.Dash
    if (startsWithVDash) {
      this.state = State.InDirectiveName
      this.sectionStart = this.index
      return
    }

    switch (c) {
      case CharCodes.Dot:
      case CharCodes.Colon:
      case CharCodes.At:
      case CharCodes.Number:
        // The shorthand character itself is the directive name; the
        // argument starts right after it.
        this.cbs.ondirname(this.index, this.index + 1)
        this.state = State.InDirectiveArg
        this.sectionStart = this.index + 1
        break
      default:
        this.state = State.InAttributeName
        this.sectionStart = this.index
    }
  }
  /** After a `/` inside an opening tag: expects `>` to finish a self-closing tag. */
  private stateInSelfClosingTag(c: number): void {
    if (c === CharCodes.Gt) {
      this.cbs.onselfclosingtag(this.index)
      this.state = State.Text
      this.sectionStart = this.index + 1
      this.inRCDATA = false // Reset special state, in case of self-closing special tags
      return
    }
    if (isWhitespace(c)) {
      return
    }
    // Anything else: carry on as if between attributes and reconsume.
    this.state = State.BeforeAttributeName
    this.stateBeforeAttributeName(c)
  }
  /** Inside a plain attribute name: reports it once `=` or a tag-section end is seen. */
  private stateInAttributeName(c: number): void {
    const nameEnded = c === CharCodes.Eq || isEndOfTagSection(c)
    if (!nameEnded) {
      return
    }
    this.cbs.onattribname(this.sectionStart, this.index)
    this.handleAttributeNameEnd(c)
  }
  /**
   * Inside a `v-` directive name. The name ends at `=`, at the end of the
   * tag section, at `:` (argument follows) or at `.` (modifier follows).
   */
  private stateInDirectiveName(c: number): void {
    if (c === CharCodes.Eq || isEndOfTagSection(c)) {
      this.cbs.ondirname(this.sectionStart, this.index)
      this.handleAttributeNameEnd(c)
      return
    }

    const startsArg = c === CharCodes.Colon
    const startsModifier = c === CharCodes.Dot
    if (!startsArg && !startsModifier) {
      return
    }

    this.cbs.ondirname(this.sectionStart, this.index)
    this.state = startsArg ? State.InDirectiveArg : State.InDirectiveModifier
    this.sectionStart = this.index + 1
  }
  /**
   * Inside a directive argument. `[` switches to the dynamic-argument state,
   * `.` ends the argument and starts a modifier.
   */
  private stateInDirectiveArg(c: number): void {
    if (c === CharCodes.Eq || isEndOfTagSection(c)) {
      this.cbs.ondirarg(this.sectionStart, this.index)
      this.handleAttributeNameEnd(c)
      return
    }
    if (c === CharCodes.LeftSqaure) {
      this.state = State.InDirectiveDynamicArg
      return
    }
    if (c === CharCodes.Dot) {
      this.cbs.ondirarg(this.sectionStart, this.index)
      this.state = State.InDirectiveModifier
      this.sectionStart = this.index + 1
    }
  }
  /** Inside the `[...]` of a dynamic directive argument: waits for `]`. */
  private stateInDynamicDirectiveArg(c: number): void {
    if (c === CharCodes.RightSquare) {
      this.state = State.InDirectiveArg
      return
    }
    if (c === CharCodes.Eq || isEndOfTagSection(c)) {
      // TODO emit error
    }
  }
  /**
   * Inside a directive modifier. Each modifier is reported separately; a `.`
   * ends the current modifier and starts the next one.
   */
  private stateInDirectiveModifier(c: number): void {
    const endsName = c === CharCodes.Eq || isEndOfTagSection(c)
    if (!endsName && c !== CharCodes.Dot) {
      return
    }

    this.cbs.ondirmodifier(this.sectionStart, this.index)
    if (endsName) {
      this.handleAttributeNameEnd(c)
    } else {
      // Another modifier follows the dot.
      this.sectionStart = this.index + 1
    }
  }
  /**
   * Shared tail for all attribute / directive name states: reports the end
   * of the name and lets the after-name state process the terminator.
   */
  private handleAttributeNameEnd(c: number): void {
    const nameEnd = this.index
    this.sectionStart = nameEnd
    this.state = State.AfterAttributeName
    this.cbs.onattribnameend(this.index)
    this.stateAfterAttributeName(c)
  }
  /**
   * After an attribute name: `=` introduces a value, whitespace is skipped,
   * and anything else means the attribute has no value.
   */
  private stateAfterAttributeName(c: number): void {
    if (c === CharCodes.Eq) {
      this.state = State.BeforeAttributeValue
      return
    }
    if (isWhitespace(c)) {
      return
    }

    // The attribute ends here without a value.
    this.cbs.onattribend(QuoteType.NoValue, this.sectionStart)

    if (c === CharCodes.Slash || c === CharCodes.Gt) {
      this.sectionStart = -1
      this.state = State.BeforeAttributeName
      this.stateBeforeAttributeName(c)
    } else {
      // The character already belongs to the next attribute.
      this.handleAttributeStart(c)
    }
  }
  /** After `=`: decides whether the value is double-, single- or unquoted. */
  private stateBeforeAttributeValue(c: number): void {
    switch (c) {
      case CharCodes.DoubleQuote:
        this.state = State.InAttributeValueDq
        this.sectionStart = this.index + 1
        break
      case CharCodes.SingleQuote:
        this.state = State.InAttributeValueSq
        this.sectionStart = this.index + 1
        break
      default:
        if (!isWhitespace(c)) {
          this.sectionStart = this.index
          this.state = State.InAttributeValueNq
          this.stateInAttributeValueNoQuotes(c) // Reconsume token
        }
    }
  }
  /**
   * Shared handler for quoted attribute values.
   *
   * @param c - Current character code.
   * @param quote - Character code of the quote that closes the value.
   */
  private handleInAttributeValue(c: number, quote: number) {
    // In the browser build the closing quote is searched for directly.
    const closed = c === quote || (__BROWSER__ && this.fastForwardTo(quote))

    if (!closed) {
      if (!__BROWSER__ && c === CharCodes.Amp) {
        this.startEntity()
      }
      return
    }

    this.cbs.onattribdata(this.sectionStart, this.index)
    this.sectionStart = -1
    const quoteType =
      quote === CharCodes.DoubleQuote ? QuoteType.Double : QuoteType.Single
    this.cbs.onattribend(quoteType, this.index + 1)
    this.state = State.BeforeAttributeName
  }
  /** Inside a double-quoted attribute value. */
  private stateInAttributeValueDoubleQuotes(c: number): void {
    const closingQuote = CharCodes.DoubleQuote
    this.handleInAttributeValue(c, closingQuote)
  }
  /** Inside a single-quoted attribute value. */
  private stateInAttributeValueSingleQuotes(c: number): void {
    const closingQuote = CharCodes.SingleQuote
    this.handleInAttributeValue(c, closingQuote)
  }
  /** Inside an unquoted attribute value: ends at whitespace or `>`. */
  private stateInAttributeValueNoQuotes(c: number): void {
    const valueEnded = isWhitespace(c) || c === CharCodes.Gt

    if (!valueEnded) {
      if (!__BROWSER__ && c === CharCodes.Amp) {
        this.startEntity()
      }
      return
    }

    this.cbs.onattribdata(this.sectionStart, this.index)
    this.sectionStart = -1
    this.cbs.onattribend(QuoteType.Unquoted, this.index)
    this.state = State.BeforeAttributeName
    // The terminator is handled by the between-attributes state.
    this.stateBeforeAttributeName(c)
  }
  /** After `<!`: distinguishes CDATA (`[`), comments (`-`) and declarations. */
  private stateBeforeDeclaration(c: number): void {
    switch (c) {
      case CharCodes.OpeningSquareBracket:
        this.state = State.CDATASequence
        this.sequenceIndex = 0
        break
      case CharCodes.Dash:
        this.state = State.BeforeComment
        break
      default:
        this.state = State.InDeclaration
    }
  }
  /** Inside a declaration: skips to the closing `>` without reporting anything. */
  private stateInDeclaration(c: number): void {
    const reachedEnd = c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)
    if (!reachedEnd) {
      return
    }
    // this.cbs.ondeclaration(this.sectionStart, this.index)
    this.state = State.Text
    this.sectionStart = this.index + 1
  }
  /** Inside a processing instruction: skips to the closing `>` without reporting anything. */
  private stateInProcessingInstruction(c: number): void {
    const reachedEnd = c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)
    if (!reachedEnd) {
      return
    }
    // this.cbs.onprocessinginstruction(this.sectionStart, this.index)
    this.state = State.Text
    this.sectionStart = this.index + 1
  }
  /** After `<!-`: a second `-` starts a comment, anything else a declaration. */
  private stateBeforeComment(c: number): void {
    if (c !== CharCodes.Dash) {
      this.state = State.InDeclaration
      return
    }
    this.state = State.InCommentLike
    this.currentSequence = Sequences.CommentEnd
    // Allow short comments (eg. <!-->)
    this.sequenceIndex = 2
    this.sectionStart = this.index + 1
  }
  /** Inside a special comment: everything up to the next `>` is reported as a comment. */
  private stateInSpecialComment(c: number): void {
    const reachedEnd = c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)
    if (!reachedEnd) {
      return
    }
    this.cbs.oncomment(this.sectionStart, this.index)
    this.state = State.Text
    this.sectionStart = this.index + 1
  }
  /**
   * Second character of a tag name starting with `s`: `c` may lead to
   * `<script`, `t` to `<style`; anything else is an ordinary tag.
   */
  private stateBeforeSpecialS(c: number): void {
    const lower = c | 0x20
    let candidate: Uint8Array | undefined
    if (lower === Sequences.ScriptEnd[3]) {
      candidate = Sequences.ScriptEnd
    } else if (lower === Sequences.StyleEnd[3]) {
      candidate = Sequences.StyleEnd
    }

    if (candidate) {
      // Index 4 is the third character of the tag name in the `</...` sequence.
      this.startSpecial(candidate, 4)
      return
    }
    this.state = State.InTagName
    this.stateInTagName(c) // Consume the token again
  }
  /**
   * Second character of a tag name starting with `t`: `i` may lead to
   * `<title`, `e` to `<textarea`; anything else is an ordinary tag.
   */
  private stateBeforeSpecialT(c: number): void {
    const lower = c | 0x20
    let candidate: Uint8Array | undefined
    if (lower === Sequences.TitleEnd[3]) {
      candidate = Sequences.TitleEnd
    } else if (lower === Sequences.TextareaEnd[3]) {
      candidate = Sequences.TextareaEnd
    }

    if (candidate) {
      // Index 4 is the third character of the tag name in the `</...` sequence.
      this.startSpecial(candidate, 4)
      return
    }
    this.state = State.InTagName
    this.stateInTagName(c) // Consume the token again
  }
  /**
   * Switches to the entity state (non-browser builds only), remembering the
   * state to return to and where the entity started.
   */
  private startEntity() {
    if (!__BROWSER__) {
      const previousState = this.state
      this.baseState = previousState
      this.state = State.InEntity
      this.entityStart = this.index

      // Text and special-tag content use legacy decoding; every other
      // state uses attribute decoding.
      const inTextContent =
        previousState === State.Text || previousState === State.InSpecialTag
      this.entityDecoder!.startEntity(
        inTextContent ? DecodingMode.Legacy : DecodingMode.Attribute
      )
    }
  }
  /**
   * Handler for `State.InEntity`.
   *
   * Feeds the buffer, from the current index, to the entity decoder and acts
   * on the number it returns:
   * - non-negative: entity handling is over and the tokenizer returns to
   *   `baseState`; when the value is exactly 0 the index is also rewound to
   *   `entityStart`;
   * - negative: the index is moved to the last position of the buffer, so
   *   the rest of the buffer counts as consumed.
   *
   * Does nothing when `__BROWSER__` is truthy.
   */
  private stateInEntity(): void {
    if (!__BROWSER__) {
      const result = this.entityDecoder!.write(this.buffer, this.index)

      if (result >= 0) {
        // Done with the entity.
        if (result === 0) {
          this.index = this.entityStart
        }
        this.state = this.baseState
        return
      }

      // Mark buffer as consumed.
      this.index = this.buffer.length - 1
    }
  }
  /**
   * Tokenizes `input`.
   *
   * Stores the input as the buffer, then walks it one index at a time and
   * passes the char code at the current index to the handler of the current
   * state. Handlers may change `this.state` and `this.index`. After each
   * handler call, if the char code that was read is a newline, `this.index`
   * is appended to `this.newlines`. When the buffer is exhausted, `cleanup`
   * and `finish` are run.
   *
   * The cases are ordered so that states more likely to be hit come first,
   * as a performance improvement; keep that order when editing.
   *
   * @param input The text to tokenize.
   */
  public parse(input: string) {
    this.buffer = input
    for (; this.index < this.buffer.length; this.index++) {
      const code = this.buffer.charCodeAt(this.index)
      switch (this.state) {
        case State.Text:
          this.stateText(code)
          break
        case State.InterpolationOpen:
          this.stateInterpolationOpen(code)
          break
        case State.Interpolation:
          this.stateInterpolation(code)
          break
        case State.InterpolationClose:
          this.stateInterpolationClose(code)
          break
        case State.SpecialStartSequence:
          this.stateSpecialStartSequence(code)
          break
        case State.InSpecialTag:
          this.stateInSpecialTag(code)
          break
        case State.CDATASequence:
          this.stateCDATASequence(code)
          break
        case State.InAttributeValueDq:
          this.stateInAttributeValueDoubleQuotes(code)
          break
        case State.InAttributeName:
          this.stateInAttributeName(code)
          break
        case State.InDirectiveName:
          this.stateInDirectiveName(code)
          break
        case State.InDirectiveArg:
          this.stateInDirectiveArg(code)
          break
        case State.InDirectiveDynamicArg:
          this.stateInDynamicDirectiveArg(code)
          break
        case State.InDirectiveModifier:
          this.stateInDirectiveModifier(code)
          break
        case State.InCommentLike:
          this.stateInCommentLike(code)
          break
        case State.InSpecialComment:
          this.stateInSpecialComment(code)
          break
        case State.BeforeAttributeName:
          this.stateBeforeAttributeName(code)
          break
        case State.InTagName:
          this.stateInTagName(code)
          break
        case State.InSFCRootTagName:
          this.stateInSFCRootTagName(code)
          break
        case State.InClosingTagName:
          this.stateInClosingTagName(code)
          break
        case State.BeforeTagName:
          this.stateBeforeTagName(code)
          break
        case State.AfterAttributeName:
          this.stateAfterAttributeName(code)
          break
        case State.InAttributeValueSq:
          this.stateInAttributeValueSingleQuotes(code)
          break
        case State.BeforeAttributeValue:
          this.stateBeforeAttributeValue(code)
          break
        case State.BeforeClosingTagName:
          this.stateBeforeClosingTagName(code)
          break
        case State.AfterClosingTagName:
          this.stateAfterClosingTagName(code)
          break
        case State.BeforeSpecialS:
          this.stateBeforeSpecialS(code)
          break
        case State.BeforeSpecialT:
          this.stateBeforeSpecialT(code)
          break
        case State.InAttributeValueNq:
          this.stateInAttributeValueNoQuotes(code)
          break
        case State.InSelfClosingTag:
          this.stateInSelfClosingTag(code)
          break
        case State.InDeclaration:
          this.stateInDeclaration(code)
          break
        case State.BeforeDeclaration:
          this.stateBeforeDeclaration(code)
          break
        case State.BeforeComment:
          this.stateBeforeComment(code)
          break
        case State.InProcessingInstruction:
          this.stateInProcessingInstruction(code)
          break
        case State.InEntity:
          // The entity handler reads from the buffer itself.
          this.stateInEntity()
          break
      }
      // Runs after the handler, so the recorded position is `this.index` as
      // the handler left it.
      if (code === CharCodes.NewLine) {
        this.newlines.push(this.index)
      }
    }
    this.cleanup()
    this.finish()
  }
  /**
   * Emits the pending section, if any, when the tokenizer is inside text or
   * inside an attribute value.
   *
   * - In `State.Text`, or in `State.InSpecialTag` with `sequenceIndex` at 0,
   *   the range from `sectionStart` to `index` is reported via `cbs.ontext`.
   * - In any of the three attribute-value states it is reported via
   *   `cbs.onattribdata`.
   *
   * In both cases `sectionStart` is then advanced to `index`. Nothing
   * happens in other states or when the section is empty.
   */
  private cleanup() {
    if (this.sectionStart === this.index) {
      return
    }

    const current = this.state
    const inText =
      current === State.Text ||
      (current === State.InSpecialTag && this.sequenceIndex === 0)
    if (inText) {
      this.cbs.ontext(this.sectionStart, this.index)
      this.sectionStart = this.index
      return
    }

    const inAttributeValue =
      current === State.InAttributeValueDq ||
      current === State.InAttributeValueSq ||
      current === State.InAttributeValueNq
    if (inAttributeValue) {
      this.cbs.onattribdata(this.sectionStart, this.index)
      this.sectionStart = this.index
    }
  }
  /**
   * Wraps up tokenization.
   *
   * If the tokenizer is still in `State.InEntity` (and `__BROWSER__` is
   * falsy), the entity decoder is ended and the state is restored to
   * `baseState`. Trailing data is then handled and `cbs.onend` is called.
   */
  private finish() {
    if (!__BROWSER__) {
      if (this.state === State.InEntity) {
        this.entityDecoder!.end()
        this.state = this.baseState
      }
    }

    this.handleTrailingData()
    this.cbs.onend()
  }
  /**
   * Handles whatever is left between `sectionStart` and the end of the
   * buffer.
   *
   * - In `State.InCommentLike` the remainder is reported via `cbs.oncdata`
   *   when the current sequence is `Sequences.CdataEnd`, otherwise via
   *   `cbs.oncomment`.
   * - In any tag, attribute or directive state listed below nothing is
   *   reported.
   * - In every other state the remainder is reported via `cbs.ontext`.
   */
  private handleTrailingData() {
    const bufferEnd = this.buffer.length

    // Nothing left to report.
    if (this.sectionStart >= bufferEnd) {
      return
    }

    switch (this.state) {
      case State.InCommentLike:
        if (this.currentSequence === Sequences.CdataEnd) {
          this.cbs.oncdata(this.sectionStart, bufferEnd)
        } else {
          this.cbs.oncomment(this.sectionStart, bufferEnd)
        }
        break

      case State.InTagName:
      case State.BeforeAttributeName:
      case State.BeforeAttributeValue:
      case State.AfterAttributeName:
      case State.InAttributeName:
      case State.InDirectiveName:
      case State.InDirectiveArg:
      case State.InDirectiveDynamicArg:
      case State.InDirectiveModifier:
      case State.InAttributeValueSq:
      case State.InAttributeValueDq:
      case State.InAttributeValueNq:
      case State.InClosingTagName:
        /*
         * We are inside an opening or closing tag. Not calling the
         * respective callback signals that the tag should be ignored.
         */
        break

      default:
        this.cbs.ontext(this.sectionStart, bufferEnd)
    }
  }
  /**
   * Reports a decoded entity.
   *
   * Any section pending before the entity is reported first, then
   * `sectionStart` is moved past the entity and `index` is set to the
   * position just before it. Finally the decoded character is reported.
   * Which callbacks are used depends on `baseState`: `ontext` and
   * `ontextentity` for `State.Text` and `State.InSpecialTag`, and
   * `onattribdata` and `onattribentity` for every other state.
   *
   * Does nothing when `__BROWSER__` is truthy.
   *
   * @param cp Code point of the decoded entity.
   * @param consumed Added to `entityStart` to get the position where the
   *   next section starts.
   */
  private emitCodePoint(cp: number, consumed: number): void {
    if (!__BROWSER__) {
      const inAttribute =
        this.baseState !== State.Text && this.baseState !== State.InSpecialTag

      // Report whatever precedes the entity.
      if (this.sectionStart < this.entityStart) {
        if (inAttribute) {
          this.cbs.onattribdata(this.sectionStart, this.entityStart)
        } else {
          this.cbs.ontext(this.sectionStart, this.entityStart)
        }
      }

      this.sectionStart = this.entityStart + consumed
      // One before the new section start.
      this.index = this.sectionStart - 1

      const decoded = fromCodePoint(cp)
      if (inAttribute) {
        this.cbs.onattribentity(decoded, this.entityStart, this.sectionStart)
      } else {
        this.cbs.ontextentity(decoded, this.entityStart, this.sectionStart)
      }
    }
  }
  1039. }