Tokenizer.ts 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076
  1. /**
  2. * This Tokenizer is adapted from htmlparser2 under the MIT License listed at
  3. * https://github.com/fb55/htmlparser2/blob/master/LICENSE
  4. Copyright 2010, 2011, Chris Winberry <chris@winberry.net>. All rights reserved.
  5. Permission is hereby granted, free of charge, to any person obtaining a copy
  6. of this software and associated documentation files (the "Software"), to
  7. deal in the Software without restriction, including without limitation the
  8. rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  9. sell copies of the Software, and to permit persons to whom the Software is
  10. furnished to do so, subject to the following conditions:
  11. The above copyright notice and this permission notice shall be included in
  12. all copies or substantial portions of the Software.
  13. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  14. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  15. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  16. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  17. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  18. FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  19. IN THE SOFTWARE.
  20. */
  21. import {
  22. EntityDecoder,
  23. DecodingMode,
  24. htmlDecodeTree
  25. } from 'entities/lib/decode.js'
  26. import { ElementNode, Position } from '../ast'
  27. export const enum ParseMode {
  28. BASE,
  29. HTML,
  30. SFC
  31. }
  32. export const enum CharCodes {
  33. Tab = 0x9, // "\t"
  34. NewLine = 0xa, // "\n"
  35. FormFeed = 0xc, // "\f"
  36. CarriageReturn = 0xd, // "\r"
  37. Space = 0x20, // " "
  38. ExclamationMark = 0x21, // "!"
  39. Number = 0x23, // "#"
  40. Amp = 0x26, // "&"
  41. SingleQuote = 0x27, // "'"
  42. DoubleQuote = 0x22, // '"'
  43. Dash = 0x2d, // "-"
  44. Slash = 0x2f, // "/"
  45. Zero = 0x30, // "0"
  46. Nine = 0x39, // "9"
  47. Semi = 0x3b, // ";"
  48. Lt = 0x3c, // "<"
  49. Eq = 0x3d, // "="
  50. Gt = 0x3e, // ">"
  51. Questionmark = 0x3f, // "?"
  52. UpperA = 0x41, // "A"
  53. LowerA = 0x61, // "a"
  54. UpperF = 0x46, // "F"
  55. LowerF = 0x66, // "f"
  56. UpperZ = 0x5a, // "Z"
  57. LowerZ = 0x7a, // "z"
  58. LowerX = 0x78, // "x"
  59. OpeningSquareBracket = 0x5b, // "["
  60. LowerV = 0x76, // "v"
  61. Dot = 0x2e, // "."
  62. Colon = 0x3a, // ":"
  63. At = 0x40, // "@"
  64. LeftSqaure = 91, // "["
  65. RightSquare = 93 // "]"
  66. }
  67. const defaultDelimitersOpen = new Uint8Array([123, 123]) // "{{"
  68. const defaultDelimitersClose = new Uint8Array([125, 125]) // "}}"
  69. /** All the states the tokenizer can be in. */
  70. const enum State {
  71. Text = 1,
  72. // interpolation
  73. InterpolationOpen,
  74. Interpolation,
  75. InterpolationClose,
  76. // Tags
  77. BeforeTagName, // After <
  78. InTagName,
  79. InSelfClosingTag,
  80. BeforeClosingTagName,
  81. InClosingTagName,
  82. AfterClosingTagName,
  83. // Attributes
  84. BeforeAttributeName,
  85. InAttributeName,
  86. InDirectiveName,
  87. InDirectiveArg,
  88. InDirectiveDynamicArg,
  89. InDirectiveModifier,
  90. AfterAttributeName,
  91. BeforeAttributeValue,
  92. InAttributeValueDq, // "
  93. InAttributeValueSq, // '
  94. InAttributeValueNq,
  95. // Declarations
  96. BeforeDeclaration, // !
  97. InDeclaration,
  98. // Processing instructions
  99. InProcessingInstruction, // ?
  100. // Comments & CDATA
  101. BeforeComment,
  102. CDATASequence,
  103. InSpecialComment,
  104. InCommentLike,
  105. // Special tags
  106. BeforeSpecialS, // Decide if we deal with `<script` or `<style`
  107. BeforeSpecialT, // Decide if we deal with `<title` or `<textarea`
  108. SpecialStartSequence,
  109. InSpecialTag,
  110. InEntity,
  111. InSFCRootTagName
  112. }
  113. /**
  114. * HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a
  115. * tag name.
  116. */
  117. function isTagStartChar(c: number): boolean {
  118. return (
  119. (c >= CharCodes.LowerA && c <= CharCodes.LowerZ) ||
  120. (c >= CharCodes.UpperA && c <= CharCodes.UpperZ)
  121. )
  122. }
  123. export function isWhitespace(c: number): boolean {
  124. return (
  125. c === CharCodes.Space ||
  126. c === CharCodes.NewLine ||
  127. c === CharCodes.Tab ||
  128. c === CharCodes.FormFeed ||
  129. c === CharCodes.CarriageReturn
  130. )
  131. }
  132. function isEndOfTagSection(c: number): boolean {
  133. return c === CharCodes.Slash || c === CharCodes.Gt || isWhitespace(c)
  134. }
  135. export function toCharCodes(str: string): Uint8Array {
  136. const ret = new Uint8Array(str.length)
  137. for (let i = 0; i < str.length; i++) {
  138. ret[i] = str.charCodeAt(i)
  139. }
  140. return ret
  141. }
  142. export enum QuoteType {
  143. NoValue = 0,
  144. Unquoted = 1,
  145. Single = 2,
  146. Double = 3
  147. }
  148. export interface Callbacks {
  149. ontext(start: number, endIndex: number): void
  150. ontextentity(codepoint: number, endIndex: number): void
  151. oninterpolation(start: number, endIndex: number): void
  152. onopentagname(start: number, endIndex: number): void
  153. onopentagend(endIndex: number): void
  154. onselfclosingtag(endIndex: number): void
  155. onclosetag(start: number, endIndex: number): void
  156. onattribdata(start: number, endIndex: number): void
  157. onattribentity(codepoint: number): void
  158. onattribend(quote: QuoteType, endIndex: number): void
  159. onattribname(start: number, endIndex: number): void
  160. onattribnameend(endIndex: number): void
  161. ondirname(start: number, endIndex: number): void
  162. ondirarg(start: number, endIndex: number): void
  163. ondirmodifier(start: number, endIndex: number): void
  164. oncomment(start: number, endIndex: number): void
  165. oncdata(start: number, endIndex: number): void
  166. // onprocessinginstruction(start: number, endIndex: number): void
  167. // ondeclaration(start: number, endIndex: number): void
  168. onend(): void
  169. }
  170. /**
  171. * Sequences used to match longer strings.
  172. *
  173. * We don't have `Script`, `Style`, or `Title` here. Instead, we re-use the *End
  174. * sequences with an increased offset.
  175. */
  176. const Sequences = {
  177. Cdata: new Uint8Array([0x43, 0x44, 0x41, 0x54, 0x41, 0x5b]), // CDATA[
  178. CdataEnd: new Uint8Array([0x5d, 0x5d, 0x3e]), // ]]>
  179. CommentEnd: new Uint8Array([0x2d, 0x2d, 0x3e]), // `-->`
  180. ScriptEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74]), // `</script`
  181. StyleEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65]), // `</style`
  182. TitleEnd: new Uint8Array([0x3c, 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65]), // `</title`
  183. TextareaEnd: new Uint8Array([
  184. 0x3c, 0x2f, 116, 101, 120, 116, 97, 114, 101, 97
  185. ]) // `</textarea
  186. }
  187. export default class Tokenizer {
  /** State the tokenizer's state machine is currently in. */
  private state: State = State.Text
  /** The input being read. */
  private buffer: string = ''
  /** Buffer offset where the section currently being read begins. */
  private sectionStart: number = 0
  /** Buffer offset of the character currently being looked at. */
  private index: number = 0
  /** Buffer offset where the most recent entity started. */
  private entityStart: number = 0
  /**
   * Some behavior, e.g. decoding entities, happens while we are logically in
   * another state. This remembers that other state so it can be restored.
   */
  private baseState: State = State.Text
  /** For special parsing behavior inside of script and style tags. */
  public inRCDATA: boolean = false
  /** Record newline positions for fast line / column calculation. */
  private newlines: number[] = []
  /** Decoder used for character references; created in the constructor. */
  private readonly entityDecoder: EntityDecoder
  /**
   * @param stack - Element stack shared with the caller; the tokenizer only
   *   reads its length to detect "root level" in SFC mode.
   * @param cbs - Receiver of all tokenizer events.
   */
  constructor(
    private readonly stack: ElementNode[],
    private readonly cbs: Callbacks
  ) {
    // Decoded code points are forwarded to `emitCodePoint`.
    this.entityDecoder = new EntityDecoder(
      htmlDecodeTree,
      (codepoint, consumedLength) =>
        this.emitCodePoint(codepoint, consumedLength)
    )
  }
  /** Active parse mode; selects which tags get special treatment. */
  public mode: ParseMode = ParseMode.BASE
  /**
   * Restore the tokenizer to its initial state so the same instance can be
   * used for a fresh input.
   *
   * Every piece of per-parse state is cleared, including the special-tag
   * flag and the various sequence cursors, so nothing from an earlier
   * (possibly aborted) parse can leak into the next one.
   */
  public reset(): void {
    // State machine and read cursors
    this.state = State.Text
    this.baseState = State.Text
    this.mode = ParseMode.BASE
    this.buffer = ''
    this.sectionStart = 0
    this.index = 0
    this.entityStart = 0

    // Special-tag tracking. If `inRCDATA` were left set, the next open tag
    // would switch to `State.InSpecialTag` while `currentSequence` is
    // unset, and reading its length would throw.
    this.inRCDATA = false
    this.currentSequence = undefined!
    this.sequenceIndex = 0

    // Line tracking
    this.newlines.length = 0

    // Interpolation delimiters
    this.delimiterOpen = defaultDelimitersOpen
    this.delimiterClose = defaultDelimitersClose
    this.delimiterIndex = -1
  }
  /**
   * Build a `Position` (1-based line and column plus the raw offset) for a
   * buffer index, using the recorded newline offsets.
   *
   * The index is expected to have been processed already, so every newline
   * before it has been recorded.
   *
   * @param index - Offset into the buffer.
   */
  public getPos(index: number): Position {
    const { newlines } = this
    // Walk backwards to the last newline located strictly before `index`.
    let i = newlines.length
    while (i-- > 0) {
      const newlineIndex = newlines[i]
      if (newlineIndex < index) {
        return {
          column: index - newlineIndex,
          // `i` is zero-based and the first line is line 1.
          line: i + 2,
          offset: index
        }
      }
    }
    // No newline precedes `index`: it is on the first line.
    return {
      column: index + 1,
      line: 1,
      offset: index
    }
  }
  /**
   * Plain text: watch for the start of a tag, an entity, or an
   * interpolation.
   */
  private stateText(c: number): void {
    if (c === CharCodes.Lt) {
      // Flush any text collected before the "<".
      if (this.sectionStart < this.index) {
        this.cbs.ontext(this.sectionStart, this.index)
      }
      this.state = State.BeforeTagName
      this.sectionStart = this.index
      return
    }
    if (c === CharCodes.Amp) {
      this.startEntity()
      return
    }
    if (c === this.delimiterOpen[0]) {
      // Possibly the first character of the opening delimiter; let the
      // interpolation-open state consume it.
      this.state = State.InterpolationOpen
      this.delimiterIndex = 0
      this.stateInterpolationOpen(c)
    }
  }
  /** Code units of the interpolation opening delimiter ("{{" by default). */
  public delimiterOpen: Uint8Array = defaultDelimitersOpen
  /** Code units of the interpolation closing delimiter ("}}" by default). */
  public delimiterClose: Uint8Array = defaultDelimitersClose
  /** Position inside the delimiter currently being matched. */
  private delimiterIndex: number = -1
  /** Match the opening interpolation delimiter one character at a time. */
  private stateInterpolationOpen(c: number): void {
    if (c !== this.delimiterOpen[this.delimiterIndex]) {
      // Not a delimiter after all: treat the character as plain text.
      this.state = State.Text
      this.stateText(c)
      return
    }

    const lastDelimiterIndex = this.delimiterOpen.length - 1
    if (this.delimiterIndex !== lastDelimiterIndex) {
      this.delimiterIndex++
      return
    }

    // Whole delimiter matched; it began `lastDelimiterIndex` characters ago.
    const start = this.index - lastDelimiterIndex
    if (this.sectionStart < start) {
      this.cbs.ontext(this.sectionStart, start)
    }
    this.state = State.Interpolation
    this.sectionStart = start
  }
  /** Inside an interpolation: wait for the closing delimiter to begin. */
  private stateInterpolation(c: number): void {
    if (c !== this.delimiterClose[0]) {
      return
    }
    this.state = State.InterpolationClose
    this.delimiterIndex = 0
    this.stateInterpolationClose(c)
  }
  /** Match the closing interpolation delimiter one character at a time. */
  private stateInterpolationClose(c: number): void {
    if (c !== this.delimiterClose[this.delimiterIndex]) {
      // False alarm: go back to scanning the interpolation body.
      this.state = State.Interpolation
      this.stateInterpolation(c)
      return
    }

    const isLastDelimiterChar =
      this.delimiterIndex === this.delimiterClose.length - 1
    if (!isLastDelimiterChar) {
      this.delimiterIndex++
      return
    }

    // The reported range includes both delimiters.
    const end = this.index + 1
    this.cbs.oninterpolation(this.sectionStart, end)
    this.state = State.Text
    this.sectionStart = this.index + 1
  }
  /** Byte sequence currently being matched; unset until a state assigns it. */
  private currentSequence: Uint8Array = undefined!
  /** Position inside `currentSequence` (or `Sequences.Cdata`) being matched. */
  private sequenceIndex: number = 0
  /**
   * Check whether the tag name being read spells out the current special
   * sequence. On a mismatch the special (`inRCDATA`) flag is dropped and the
   * name is handled like any other tag name.
   */
  private stateSpecialStartSequence(c: number): void {
    const atSequenceEnd = this.sequenceIndex === this.currentSequence.length

    let matches: boolean
    if (atSequenceEnd) {
      // The whole sequence matched; the tag name must end here as well.
      matches = isEndOfTagSection(c)
    } else {
      // Case-insensitive comparison against the (lower-case) sequence.
      matches = (c | 0x20) === this.currentSequence[this.sequenceIndex]
    }

    if (matches && !atSequenceEnd) {
      this.sequenceIndex++
      return
    }
    if (!matches) {
      this.inRCDATA = false
    }

    // Either finished or failed: continue as a regular tag name and
    // re-consume the character.
    this.sequenceIndex = 0
    this.state = State.InTagName
    this.stateInTagName(c)
  }
  /**
   * Inside a special tag: look for its end tag. For `<title>` and
   * `<textarea>`, entities are decoded as well.
   *
   * Characters are compared as `(c | 0x20)`, so only sequence bytes that
   * already have bit 0x20 set can ever match.
   */
  private stateInSpecialTag(c: number): void {
    const sequence = this.currentSequence

    if (this.sequenceIndex === sequence.length) {
      if (c === CharCodes.Gt || isWhitespace(c)) {
        const textEnd = this.index - sequence.length

        if (textEnd > this.sectionStart) {
          // Spoof the index so that reported locations match up.
          const realIndex = this.index
          this.index = textEnd
          this.cbs.ontext(this.sectionStart, textEnd)
          this.index = realIndex
        }

        this.sectionStart = textEnd + 2 // Skip over the `</`
        this.stateInClosingTagName(c)
        this.inRCDATA = false
        return // We are done; skip the rest of the function.
      }

      // The full sequence matched but the tag name goes on: start over.
      this.sequenceIndex = 0
    }

    if ((c | 0x20) === sequence[this.sequenceIndex]) {
      this.sequenceIndex += 1
      return
    }

    if (this.sequenceIndex !== 0) {
      // If we see a `<`, set the sequence index to 1; useful for eg. `<</script>`.
      this.sequenceIndex = c === CharCodes.Lt ? 1 : 0
      return
    }

    const atSfcRoot = this.mode === ParseMode.SFC && this.stack.length === 0
    const decodesEntities =
      sequence === Sequences.TitleEnd ||
      (sequence === Sequences.TextareaEnd && !atSfcRoot)

    if (decodesEntities) {
      // We have to parse entities in <title> and <textarea> tags.
      if (c === CharCodes.Amp) {
        this.startEntity()
      }
    } else if (this.fastForwardTo(CharCodes.Lt)) {
      // Outside of <title> and <textarea> tags, we can fast-forward.
      this.sequenceIndex = 1
    }
  }
  /**
   * After `<![`: match the rest of `CDATA[`. Anything else turns the
   * construct into a declaration.
   */
  private stateCDATASequence(c: number): void {
    if (c !== Sequences.Cdata[this.sequenceIndex]) {
      this.sequenceIndex = 0
      this.state = State.InDeclaration
      this.stateInDeclaration(c) // Reconsume the character
      return
    }

    this.sequenceIndex += 1
    if (this.sequenceIndex === Sequences.Cdata.length) {
      // Full `CDATA[` seen; the content starts at the next character.
      this.state = State.InCommentLike
      this.currentSequence = Sequences.CdataEnd
      this.sequenceIndex = 0
      this.sectionStart = this.index + 1
    }
  }
  /**
   * When we wait for one specific character, we can speed things up by
   * skipping through the buffer until we find it.
   *
   * The search starts at the character after the current index. On success
   * `index` points at the match; otherwise it is left on the last character
   * of the buffer.
   *
   * @param c - Code unit to look for.
   * @returns Whether the character was found.
   */
  private fastForwardTo(c: number): boolean {
    const { buffer } = this
    for (this.index++; this.index < buffer.length; this.index++) {
      if (buffer.charCodeAt(this.index) === c) {
        return true
      }
    }

    /*
     * We increment the index at the end of the `parse` loop,
     * so set it to `buffer.length - 1` here.
     *
     * TODO: Refactor `parse` to increment index before calling states.
     */
    this.index = buffer.length - 1
    return false
  }
  /**
   * Comments and CDATA end with `-->` and `]]>`.
   *
   * Their common qualities are:
   * - Their end sequences have a distinct character they start with.
   * - That character is then repeated, so we have to check multiple repeats.
   * - All characters but the start character of the sequence can be skipped.
   */
  private stateInCommentLike(c: number): void {
    const sequence = this.currentSequence

    if (c === sequence[this.sequenceIndex]) {
      this.sequenceIndex += 1
      if (this.sequenceIndex !== sequence.length) {
        return
      }
      // End sequence complete: the content stops two characters back.
      const contentEnd = this.index - 2
      if (sequence === Sequences.CdataEnd) {
        this.cbs.oncdata(this.sectionStart, contentEnd)
      } else {
        this.cbs.oncomment(this.sectionStart, contentEnd)
      }
      this.sequenceIndex = 0
      this.sectionStart = this.index + 1
      this.state = State.Text
      return
    }

    if (this.sequenceIndex === 0) {
      // Fast-forward to the first character of the sequence
      if (this.fastForwardTo(sequence[0])) {
        this.sequenceIndex = 1
      }
      return
    }

    // Allow long sequences, eg. --->, ]]]>
    if (c !== sequence[this.sequenceIndex - 1]) {
      this.sequenceIndex = 0
    }
  }
  /**
   * Begin matching the name of a special tag.
   *
   * @param sequence - End sequence of the tag; its tail doubles as the tag
   *   name to match.
   * @param offset - Index inside `sequence` at which matching starts.
   */
  private startSpecial(sequence: Uint8Array, offset: number): void {
    this.state = State.SpecialStartSequence
    this.currentSequence = sequence
    this.sequenceIndex = offset
    this.inRCDATA = true
  }
  /** Directly after `<`: decide what kind of construct follows. */
  private stateBeforeTagName(c: number): void {
    switch (c) {
      case CharCodes.ExclamationMark:
        this.state = State.BeforeDeclaration
        this.sectionStart = this.index + 1
        return
      case CharCodes.Questionmark:
        this.state = State.InProcessingInstruction
        this.sectionStart = this.index + 1
        return
      case CharCodes.Slash:
        this.state = State.BeforeClosingTagName
        return
    }

    if (!isTagStartChar(c)) {
      // Not a tag after all; treat the character as text.
      this.state = State.Text
      this.stateText(c)
      return
    }

    this.sectionStart = this.index

    if (this.mode === ParseMode.BASE) {
      // no special tags in base mode
      this.state = State.InTagName
      return
    }

    if (this.mode === ParseMode.SFC && this.stack.length === 0) {
      // SFC mode + root level
      // - everything except <template> is RAWTEXT
      // - <template> with lang other than html is also RAWTEXT
      this.state = State.InSFCRootTagName
      return
    }

    // HTML mode
    // - <script>, <style> RAWTEXT
    // - <title>, <textarea> RCDATA
    switch (c | 0x20) {
      case 116 /* t */:
        this.state = State.BeforeSpecialT
        break
      case 115 /* s */:
        this.state = State.BeforeSpecialS
        break
      default:
        this.state = State.InTagName
    }
  }
  /** Inside an opening tag name: wait for the name to end. */
  private stateInTagName(c: number): void {
    if (!isEndOfTagSection(c)) {
      return
    }
    this.handleTagName(c)
  }
  /**
   * Inside the name of a root-level tag in SFC mode. Every tag except
   * `template` has its content treated as raw text up to the matching end
   * tag.
   */
  private stateInSFCRootTagName(c: number): void {
    if (!isEndOfTagSection(c)) {
      return
    }

    const tag = this.buffer.slice(this.sectionStart, this.index)
    if (tag !== 'template') {
      this.inRCDATA = true
      // The end tag is matched by comparing `(c | 0x20)` with the sequence,
      // so the sequence must hold lower-case letters. Without this, a tag
      // containing an upper-case ASCII letter (e.g. `<Docs>`) could never
      // be closed. Only A-Z is folded so the length stays unchanged.
      const lowerCasedTag = tag.replace(/[A-Z]/g, ch => ch.toLowerCase())
      this.currentSequence = toCharCodes(`</` + lowerCasedTag)
    }
    this.handleTagName(c)
  }
  /**
   * Report a completed opening tag name, then re-consume the terminating
   * character as the start of the attribute list.
   */
  private handleTagName(c: number): void {
    const nameStart = this.sectionStart
    this.cbs.onopentagname(nameStart, this.index)
    this.sectionStart = -1
    this.state = State.BeforeAttributeName
    this.stateBeforeAttributeName(c)
  }
  /** Directly after `</`. */
  private stateBeforeClosingTagName(c: number): void {
    if (isWhitespace(c)) {
      // Ignore
      return
    }
    if (c === CharCodes.Gt) {
      // `</>`: nothing to report.
      this.state = State.Text
      return
    }

    this.sectionStart = this.index
    if (isTagStartChar(c)) {
      this.state = State.InClosingTagName
    } else {
      // Anything that cannot start a tag name is handled like a comment.
      this.state = State.InSpecialComment
    }
  }
  /** Inside a closing tag name: wait for `>` or whitespace. */
  private stateInClosingTagName(c: number): void {
    const nameEnded = c === CharCodes.Gt || isWhitespace(c)
    if (!nameEnded) {
      return
    }
    this.cbs.onclosetag(this.sectionStart, this.index)
    this.sectionStart = -1
    this.state = State.AfterClosingTagName
    this.stateAfterClosingTagName(c)
  }
  /** After a closing tag name: skip everything until ">". */
  private stateAfterClosingTagName(c: number): void {
    const reachedGt = c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)
    if (!reachedGt) {
      return
    }
    this.state = State.Text
    this.sectionStart = this.index + 1
  }
  /** Inside an opening tag, between attributes. */
  private stateBeforeAttributeName(c: number): void {
    if (c === CharCodes.Gt) {
      this.cbs.onopentagend(this.index)
      // `inRCDATA` is read only after the callback has run.
      if (!this.inRCDATA) {
        this.state = State.Text
      } else {
        this.state = State.InSpecialTag
        this.sequenceIndex = 0
      }
      this.sectionStart = this.index + 1
      return
    }

    if (c === CharCodes.Slash) {
      this.state = State.InSelfClosingTag
      return
    }

    if (isWhitespace(c)) {
      return
    }
    this.handleAttributeStart(c)
  }
  /**
   * Classify the first character of an attribute: a `v-` directive, a
   * single-character directive shorthand (".", ":", "@", "#"), or a plain
   * attribute name.
   */
  private handleAttributeStart(c: number): void {
    const startsWithVDash =
      c === CharCodes.LowerV &&
      this.buffer.charCodeAt(this.index + 1) === CharCodes.Dash

    if (startsWithVDash) {
      this.state = State.InDirectiveName
      this.sectionStart = this.index
      return
    }

    switch (c) {
      case CharCodes.Dot:
      case CharCodes.Colon:
      case CharCodes.At:
      case CharCodes.Number:
        // The shorthand character itself is reported as the directive name.
        this.cbs.ondirname(this.index, this.index + 1)
        this.state = State.InDirectiveArg
        this.sectionStart = this.index + 1
        break
      default:
        this.state = State.InAttributeName
        this.sectionStart = this.index
    }
  }
  /** After a "/" inside an opening tag. */
  private stateInSelfClosingTag(c: number): void {
    if (c === CharCodes.Gt) {
      this.cbs.onselfclosingtag(this.index)
      this.state = State.Text
      this.sectionStart = this.index + 1
      this.inRCDATA = false // Reset special state, in case of self-closing special tags
      return
    }
    if (isWhitespace(c)) {
      return
    }
    // The "/" was not followed by ">": carry on with the attribute list.
    this.state = State.BeforeAttributeName
    this.stateBeforeAttributeName(c)
  }
  /** Inside a plain attribute name: wait for "=" or the end of the name. */
  private stateInAttributeName(c: number): void {
    const nameEnded = c === CharCodes.Eq || isEndOfTagSection(c)
    if (!nameEnded) {
      return
    }
    this.cbs.onattribname(this.sectionStart, this.index)
    this.handleAttributeNameEnd(c)
  }
  /**
   * Inside a `v-` directive name. The name ends at "=", at the end of the
   * tag section, at ":" (an argument follows) or at "." (a modifier
   * follows).
   */
  private stateInDirectiveName(c: number): void {
    const attributeEnded = c === CharCodes.Eq || isEndOfTagSection(c)
    const argFollows = c === CharCodes.Colon
    const modifierFollows = c === CharCodes.Dot
    if (!attributeEnded && !argFollows && !modifierFollows) {
      return
    }

    // In every terminating case the name is reported first.
    this.cbs.ondirname(this.sectionStart, this.index)

    if (attributeEnded) {
      this.handleAttributeNameEnd(c)
      return
    }
    this.state = argFollows
      ? State.InDirectiveArg
      : State.InDirectiveModifier
    this.sectionStart = this.index + 1
  }
  /**
   * Inside a directive argument. "[" enters a dynamic argument, "." ends
   * the argument and starts a modifier.
   */
  private stateInDirectiveArg(c: number): void {
    if (c === CharCodes.Eq || isEndOfTagSection(c)) {
      this.cbs.ondirarg(this.sectionStart, this.index)
      this.handleAttributeNameEnd(c)
      return
    }

    switch (c) {
      case CharCodes.LeftSqaure:
        this.state = State.InDirectiveDynamicArg
        break
      case CharCodes.Dot:
        this.cbs.ondirarg(this.sectionStart, this.index)
        this.state = State.InDirectiveModifier
        this.sectionStart = this.index + 1
        break
    }
  }
  /** Inside `[...]` of a dynamic directive argument: wait for "]". */
  private stateInDynamicDirectiveArg(c: number): void {
    if (c === CharCodes.RightSquare) {
      this.state = State.InDirectiveArg
      return
    }
    // TODO emit error when "=" or the end of the tag section is reached
    // before the closing "]". Nothing is done for those characters today.
  }
  /**
   * Inside a directive modifier. Another "." starts the next modifier;
   * "=" or the end of the tag section ends the attribute name.
   */
  private stateInDirectiveModifier(c: number): void {
    const attributeEnded = c === CharCodes.Eq || isEndOfTagSection(c)
    if (!attributeEnded && c !== CharCodes.Dot) {
      return
    }

    this.cbs.ondirmodifier(this.sectionStart, this.index)

    if (attributeEnded) {
      this.handleAttributeNameEnd(c)
    } else {
      this.sectionStart = this.index + 1
    }
  }
  /**
   * Shared tail for all attribute / directive name states: report the end
   * of the name and re-consume the terminating character.
   */
  private handleAttributeNameEnd(c: number): void {
    const nameEnd = this.index
    this.sectionStart = nameEnd
    this.state = State.AfterAttributeName
    this.cbs.onattribnameend(nameEnd)
    this.stateAfterAttributeName(c)
  }
  /**
   * After an attribute name: "=" introduces a value, whitespace is skipped,
   * and anything else means the attribute had no value.
   */
  private stateAfterAttributeName(c: number): void {
    if (c === CharCodes.Eq) {
      this.state = State.BeforeAttributeValue
      return
    }
    if (isWhitespace(c)) {
      return
    }

    // The attribute ends here without a value.
    this.cbs.onattribend(QuoteType.NoValue, this.sectionStart)

    if (c === CharCodes.Slash || c === CharCodes.Gt) {
      this.sectionStart = -1
      this.state = State.BeforeAttributeName
      this.stateBeforeAttributeName(c)
    } else {
      // `c` is already the first character of the next attribute.
      this.handleAttributeStart(c)
    }
  }
  /** After "=": determine how the attribute value is quoted. */
  private stateBeforeAttributeValue(c: number): void {
    switch (c) {
      case CharCodes.DoubleQuote:
        this.state = State.InAttributeValueDq
        this.sectionStart = this.index + 1
        break
      case CharCodes.SingleQuote:
        this.state = State.InAttributeValueSq
        this.sectionStart = this.index + 1
        break
      default:
        if (!isWhitespace(c)) {
          this.sectionStart = this.index
          this.state = State.InAttributeValueNq
          this.stateInAttributeValueNoQuotes(c) // Reconsume token
        }
    }
  }
  /**
   * Shared logic for quoted attribute values.
   *
   * @param c - Current code unit.
   * @param quote - Code unit of the quote character that closes the value.
   */
  private handleInAttributeValue(c: number, quote: number): void {
    if (c !== quote) {
      if (c === CharCodes.Amp) {
        this.startEntity()
      }
      return
    }

    // Closing quote reached.
    this.cbs.onattribdata(this.sectionStart, this.index)
    this.sectionStart = -1
    const quoteType =
      quote === CharCodes.DoubleQuote ? QuoteType.Double : QuoteType.Single
    this.cbs.onattribend(quoteType, this.index + 1)
    this.state = State.BeforeAttributeName
  }
  /** Inside a double-quoted attribute value. */
  private stateInAttributeValueDoubleQuotes(c: number): void {
    const closingQuote = CharCodes.DoubleQuote
    this.handleInAttributeValue(c, closingQuote)
  }
  /** Inside a single-quoted attribute value. */
  private stateInAttributeValueSingleQuotes(c: number): void {
    const closingQuote = CharCodes.SingleQuote
    this.handleInAttributeValue(c, closingQuote)
  }
  /** Inside an unquoted attribute value: ends at whitespace or ">". */
  private stateInAttributeValueNoQuotes(c: number): void {
    const valueEnded = isWhitespace(c) || c === CharCodes.Gt
    if (!valueEnded) {
      if (c === CharCodes.Amp) {
        this.startEntity()
      }
      return
    }

    this.cbs.onattribdata(this.sectionStart, this.index)
    this.sectionStart = -1
    this.cbs.onattribend(QuoteType.Unquoted, this.index)
    this.state = State.BeforeAttributeName
    this.stateBeforeAttributeName(c)
  }
  /** After `<!`: distinguish CDATA, comments and declarations. */
  private stateBeforeDeclaration(c: number): void {
    switch (c) {
      case CharCodes.OpeningSquareBracket:
        this.state = State.CDATASequence
        this.sequenceIndex = 0
        break
      case CharCodes.Dash:
        this.state = State.BeforeComment
        break
      default:
        this.state = State.InDeclaration
    }
  }
  /** Inside a declaration: skip to the closing ">". Nothing is emitted. */
  private stateInDeclaration(c: number): void {
    const reachedGt = c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)
    if (!reachedGt) {
      return
    }
    // this.cbs.ondeclaration(this.sectionStart, this.index)
    this.state = State.Text
    this.sectionStart = this.index + 1
  }
  /**
   * Inside a processing instruction: skip to the closing ">". Nothing is
   * emitted.
   */
  private stateInProcessingInstruction(c: number): void {
    const reachedGt = c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)
    if (!reachedGt) {
      return
    }
    // this.cbs.onprocessinginstruction(this.sectionStart, this.index)
    this.state = State.Text
    this.sectionStart = this.index + 1
  }
  /** After `<!-`: a second "-" starts a comment, anything else a declaration. */
  private stateBeforeComment(c: number): void {
    if (c !== CharCodes.Dash) {
      this.state = State.InDeclaration
      return
    }
    this.state = State.InCommentLike
    this.currentSequence = Sequences.CommentEnd
    // Allow short comments (eg. <!-->)
    this.sequenceIndex = 2
    this.sectionStart = this.index + 1
  }
  /**
   * Inside a "special" comment (entered from `</` followed by a character
   * that cannot start a tag name): everything up to ">" is reported as a
   * comment.
   */
  private stateInSpecialComment(c: number): void {
    const reachedGt = c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)
    if (!reachedGt) {
      return
    }
    this.cbs.oncomment(this.sectionStart, this.index)
    this.state = State.Text
    this.sectionStart = this.index + 1
  }
  /**
   * Second character of a tag starting with "s": decide between `<script`,
   * `<style` and an ordinary tag (compared case-insensitively).
   */
  private stateBeforeSpecialS(c: number): void {
    const lowered = c | 0x20

    if (lowered === Sequences.ScriptEnd[3]) {
      this.startSpecial(Sequences.ScriptEnd, 4)
      return
    }
    if (lowered === Sequences.StyleEnd[3]) {
      this.startSpecial(Sequences.StyleEnd, 4)
      return
    }

    this.state = State.InTagName
    this.stateInTagName(c) // Consume the token again
  }
  /**
   * Second character of a tag starting with "t": decide between `<title`,
   * `<textarea` and an ordinary tag (compared case-insensitively).
   */
  private stateBeforeSpecialT(c: number): void {
    const lowered = c | 0x20

    if (lowered === Sequences.TitleEnd[3]) {
      this.startSpecial(Sequences.TitleEnd, 4)
      return
    }
    if (lowered === Sequences.TextareaEnd[3]) {
      this.startSpecial(Sequences.TextareaEnd, 4)
      return
    }

    this.state = State.InTagName
    this.stateInTagName(c) // Consume the token again
  }
  /**
   * Switch into entity decoding, remembering the state to return to.
   * Entities found in text (including special-tag content) use legacy
   * decoding; all others use attribute decoding.
   */
  private startEntity(): void {
    const previousState = this.state
    this.baseState = previousState
    this.state = State.InEntity
    this.entityStart = this.index

    const isTextContext =
      previousState === State.Text || previousState === State.InSpecialTag
    this.entityDecoder.startEntity(
      isTextContext ? DecodingMode.Legacy : DecodingMode.Attribute
    )
  }
  /**
   * Feed the buffer to the entity decoder, starting at the current index,
   * and act on the length it reports.
   */
  private stateInEntity(): void {
    const length = this.entityDecoder.write(this.buffer, this.index)

    if (length < 0) {
      // The decoder is not done yet: mark buffer as consumed.
      this.index = this.buffer.length - 1
      return
    }

    // A non-negative `length` means we are done with the entity.
    this.state = this.baseState
    if (length === 0) {
      // A length of 0 sends the index back to where the entity started.
      this.index = this.entityStart
    }
  }
  /**
   * Iterates through the buffer, calling the function corresponding to the current state.
   *
   * States that are more likely to be hit are higher up, as a performance improvement.
   *
   * Once the whole input has been walked, any pending section is flushed via
   * `cleanup()` and the tokenizer is finalized via `finish()`.
   *
   * @param input - The complete source text to tokenize.
   */
  public parse(input: string): void {
    this.buffer = input
    while (this.index < this.buffer.length) {
      const c = this.buffer.charCodeAt(this.index)
      /*
       * Record the newline BEFORE dispatching: state handlers may move
       * `this.index` (e.g. `stateInEntity` rewinds to `entityStart` and
       * `emitCodePoint` repositions it), so after the switch `this.index`
       * no longer has to be the offset `c` was read from.
       *
       * The entity state is skipped because a rejected entity rewinds the
       * index and its characters are scanned again in the base state, where
       * the newline is then recorded exactly once.
       */
      if (c === CharCodes.NewLine && this.state !== State.InEntity) {
        this.newlines.push(this.index)
      }
      switch (this.state) {
        case State.Text: {
          this.stateText(c)
          break
        }
        case State.InterpolationOpen: {
          this.stateInterpolationOpen(c)
          break
        }
        case State.Interpolation: {
          this.stateInterpolation(c)
          break
        }
        case State.InterpolationClose: {
          this.stateInterpolationClose(c)
          break
        }
        case State.SpecialStartSequence: {
          this.stateSpecialStartSequence(c)
          break
        }
        case State.InSpecialTag: {
          this.stateInSpecialTag(c)
          break
        }
        case State.CDATASequence: {
          this.stateCDATASequence(c)
          break
        }
        case State.InAttributeValueDq: {
          this.stateInAttributeValueDoubleQuotes(c)
          break
        }
        case State.InAttributeName: {
          this.stateInAttributeName(c)
          break
        }
        case State.InDirectiveName: {
          this.stateInDirectiveName(c)
          break
        }
        case State.InDirectiveArg: {
          this.stateInDirectiveArg(c)
          break
        }
        case State.InDirectiveDynamicArg: {
          this.stateInDynamicDirectiveArg(c)
          break
        }
        case State.InDirectiveModifier: {
          this.stateInDirectiveModifier(c)
          break
        }
        case State.InCommentLike: {
          this.stateInCommentLike(c)
          break
        }
        case State.InSpecialComment: {
          this.stateInSpecialComment(c)
          break
        }
        case State.BeforeAttributeName: {
          this.stateBeforeAttributeName(c)
          break
        }
        case State.InTagName: {
          this.stateInTagName(c)
          break
        }
        case State.InSFCRootTagName: {
          this.stateInSFCRootTagName(c)
          break
        }
        case State.InClosingTagName: {
          this.stateInClosingTagName(c)
          break
        }
        case State.BeforeTagName: {
          this.stateBeforeTagName(c)
          break
        }
        case State.AfterAttributeName: {
          this.stateAfterAttributeName(c)
          break
        }
        case State.InAttributeValueSq: {
          this.stateInAttributeValueSingleQuotes(c)
          break
        }
        case State.BeforeAttributeValue: {
          this.stateBeforeAttributeValue(c)
          break
        }
        case State.BeforeClosingTagName: {
          this.stateBeforeClosingTagName(c)
          break
        }
        case State.AfterClosingTagName: {
          this.stateAfterClosingTagName(c)
          break
        }
        case State.BeforeSpecialS: {
          this.stateBeforeSpecialS(c)
          break
        }
        case State.BeforeSpecialT: {
          this.stateBeforeSpecialT(c)
          break
        }
        case State.InAttributeValueNq: {
          this.stateInAttributeValueNoQuotes(c)
          break
        }
        case State.InSelfClosingTag: {
          this.stateInSelfClosingTag(c)
          break
        }
        case State.InDeclaration: {
          this.stateInDeclaration(c)
          break
        }
        case State.BeforeDeclaration: {
          this.stateBeforeDeclaration(c)
          break
        }
        case State.BeforeComment: {
          this.stateBeforeComment(c)
          break
        }
        case State.InProcessingInstruction: {
          this.stateInProcessingInstruction(c)
          break
        }
        case State.InEntity: {
          this.stateInEntity()
          break
        }
      }
      // Handlers leave `index` on the last character they consumed.
      this.index++
    }
    this.cleanup()
    this.finish()
  }
  /**
   * Flushes the section collected so far (`sectionStart` up to `index`) to
   * the callbacks, if the tokenizer is currently inside text or an attribute
   * value. In every other state nothing is emitted and `sectionStart` is left
   * untouched.
   */
  private cleanup() {
    // Nothing has been collected since the last emit.
    if (this.sectionStart === this.index) {
      return
    }

    const state = this.state
    const insideText =
      state === State.Text ||
      (state === State.InSpecialTag && this.sequenceIndex === 0)

    if (insideText) {
      this.cbs.ontext(this.sectionStart, this.index)
      this.sectionStart = this.index
      return
    }

    const insideAttributeValue =
      state === State.InAttributeValueDq ||
      state === State.InAttributeValueSq ||
      state === State.InAttributeValueNq

    if (insideAttributeValue) {
      this.cbs.onattribdata(this.sectionStart, this.index)
      this.sectionStart = this.index
    }
  }
  /**
   * Finalizes tokenization: ends an entity that is still being decoded and
   * restores the state it interrupted, hands any trailing data to the
   * callbacks, and finally signals the end of input via `onend`.
   */
  private finish() {
    const entityPending = this.state === State.InEntity
    if (entityPending) {
      // Let the decoder finish with what it has, then resume the base state.
      this.entityDecoder.end()
      this.state = this.baseState
    }

    this.handleTrailingData()

    this.cbs.onend()
  }
  /**
   * Emits whatever is left between `sectionStart` and the end of the buffer.
   *
   * - Inside a comment-like section the remainder is reported as CDATA when
   *   the current sequence is `Sequences.CdataEnd`, otherwise as a comment.
   * - Inside an unfinished opening or closing tag nothing is reported.
   * - In all remaining states the remainder is reported as text.
   */
  private handleTrailingData() {
    const endIndex = this.buffer.length

    // Everything has already been emitted.
    if (this.sectionStart >= endIndex) {
      return
    }

    switch (this.state) {
      case State.InCommentLike:
        if (this.currentSequence === Sequences.CdataEnd) {
          this.cbs.oncdata(this.sectionStart, endIndex)
        } else {
          this.cbs.oncomment(this.sectionStart, endIndex)
        }
        break

      case State.InTagName:
      case State.BeforeAttributeName:
      case State.BeforeAttributeValue:
      case State.AfterAttributeName:
      case State.InAttributeName:
      case State.InDirectiveName:
      case State.InDirectiveArg:
      case State.InDirectiveDynamicArg:
      case State.InDirectiveModifier:
      case State.InAttributeValueSq:
      case State.InAttributeValueDq:
      case State.InAttributeValueNq:
      case State.InClosingTagName:
        /*
         * The input ended in the middle of an opening or closing tag. Not
         * calling the respective callback signals that the tag should be
         * ignored.
         */
        break

      default:
        this.cbs.ontext(this.sectionStart, endIndex)
    }
  }
  /**
   * Reports a decoded entity to the callbacks.
   *
   * Any data collected before the entity is emitted first (as text or as
   * attribute data, depending on the state the entity interrupted). Then the
   * section start is moved past the entity, the index is placed on the
   * entity's last consumed character, and the entity itself is reported.
   *
   * @param cp - The decoded code point.
   * @param consumed - Number of characters the entity occupies, counted from
   *   `entityStart`.
   */
  private emitCodePoint(cp: number, consumed: number): void {
    const inText =
      this.baseState === State.Text || this.baseState === State.InSpecialTag

    // Flush what was collected before the entity started.
    if (this.sectionStart < this.entityStart) {
      if (inText) {
        this.cbs.ontext(this.sectionStart, this.entityStart)
      } else {
        this.cbs.onattribdata(this.sectionStart, this.entityStart)
      }
    }

    const afterEntity = this.entityStart + consumed
    this.sectionStart = afterEntity
    // One before the new section start, because the main loop increments
    // the index after every state handler.
    this.index = afterEntity - 1

    if (inText) {
      this.cbs.ontextentity(cp, this.sectionStart)
    } else {
      this.cbs.onattribentity(cp)
    }
  }
  1004. }