// Tokenizer.ts
/**
 * This Tokenizer is adapted from htmlparser2 under the MIT License listed at
 * https://github.com/fb55/htmlparser2/blob/master/LICENSE
 *
 * Copyright 2010, 2011, Chris Winberry <chris@winberry.net>. All rights reserved.
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
  21. import {
  22. EntityDecoder,
  23. DecodingMode,
  24. htmlDecodeTree
  25. } from 'entities/lib/decode.js'
  26. import { ElementNode, Position } from '../ast'
  /**
   * Parsing modes the tokenizer can run in. The active mode decides which
   * state is entered when a tag name starts.
   */
  export const enum ParseMode {
    /** No special tags are detected. */
    BASE = 0,
    /** `<script>`/`<style>` and `<title>`/`<textarea>` are detected. */
    HTML = 1,
    /** Tags opened while the element stack is empty use the SFC root state. */
    SFC = 2
  }
  /**
   * UTF-16 code units of the characters the tokenizer compares against.
   *
   * `OpeningSquareBracket`, `LeftSquare` and `LeftSqaure` all denote "[".
   */
  export const enum CharCodes {
    Tab = 0x9, // "\t"
    NewLine = 0xa, // "\n"
    FormFeed = 0xc, // "\f"
    CarriageReturn = 0xd, // "\r"
    Space = 0x20, // " "
    ExclamationMark = 0x21, // "!"
    Number = 0x23, // "#"
    Amp = 0x26, // "&"
    SingleQuote = 0x27, // "'"
    DoubleQuote = 0x22, // '"'
    Dash = 0x2d, // "-"
    Slash = 0x2f, // "/"
    Zero = 0x30, // "0"
    Nine = 0x39, // "9"
    Semi = 0x3b, // ";"
    Lt = 0x3c, // "<"
    Eq = 0x3d, // "="
    Gt = 0x3e, // ">"
    Questionmark = 0x3f, // "?"
    UpperA = 0x41, // "A"
    LowerA = 0x61, // "a"
    UpperF = 0x46, // "F"
    LowerF = 0x66, // "f"
    UpperZ = 0x5a, // "Z"
    LowerZ = 0x7a, // "z"
    LowerX = 0x78, // "x"
    OpeningSquareBracket = 0x5b, // "["
    LowerV = 0x76, // "v"
    Dot = 0x2e, // "."
    Colon = 0x3a, // ":"
    At = 0x40, // "@"
    LeftSquare = 0x5b, // "["
    // Misspelled name kept so existing references keep compiling; prefer
    // `LeftSquare` in new code.
    LeftSqaure = 0x5b, // "["
    RightSquare = 0x5d // "]"
  }
  // Default interpolation delimiters, stored as character codes.
  const defaultDelimitersOpen = Uint8Array.of(0x7b, 0x7b) // "{{"
  const defaultDelimitersClose = Uint8Array.of(0x7d, 0x7d) // "}}"
  /**
   * Every state of the tokenizer's state machine. Numbering starts at 1 and
   * follows declaration order.
   */
  const enum State {
    /** Plain text outside of any tag. */
    Text = 1,
    /** Between the opening and closing interpolation delimiters. */
    Interpolation,

    // ---- Tags ----
    /** Directly after `<`. */
    BeforeTagName,
    InTagName,
    InSelfClosingTag,
    BeforeClosingTagName,
    InClosingTagName,
    AfterClosingTagName,

    // ---- Attributes ----
    BeforeAttributeName,
    InAttributeName,
    InDirectiveName,
    InDirectiveArg,
    InDirectiveDynamicArg,
    InDirectiveModifier,
    AfterAttributeName,
    BeforeAttributeValue,
    /** Inside a value quoted with `"`. */
    InAttributeValueDq,
    /** Inside a value quoted with `'`. */
    InAttributeValueSq,
    /** Inside an unquoted value. */
    InAttributeValueNq,

    // ---- Declarations ----
    /** Directly after `<!`. */
    BeforeDeclaration,
    InDeclaration,

    // ---- Processing instructions ----
    /** After `<?`. */
    InProcessingInstruction,

    // ---- Comments & CDATA ----
    BeforeComment,
    CDATASequence,
    InSpecialComment,
    InCommentLike,

    // ---- Special tags ----
    /** Decide whether we are looking at `<script` or `<style`. */
    BeforeSpecialS,
    /** Decide whether we are looking at `<title` or `<textarea`. */
    BeforeSpecialT,
    SpecialStartSequence,
    InSpecialTag,

    InEntity,

    InSFCRootTagName
  }
  /**
   * Tells whether a character code may begin a tag name. HTML only allows
   * ASCII letters (a-z and A-Z) in that position.
   *
   * @param c - Character code to test.
   * @returns `true` for an ASCII letter, `false` otherwise.
   */
  function isTagStartChar(c: number): boolean {
    if (c >= CharCodes.LowerA && c <= CharCodes.LowerZ) {
      return true
    }
    return c >= CharCodes.UpperA && c <= CharCodes.UpperZ
  }
  /**
   * Tells whether a character code is one of the five whitespace characters
   * the tokenizer recognizes: space, line feed, tab, form feed or carriage
   * return.
   *
   * @param c - Character code to test.
   */
  export function isWhitespace(c: number): boolean {
    switch (c) {
      case CharCodes.Space:
      case CharCodes.NewLine:
      case CharCodes.Tab:
      case CharCodes.FormFeed:
      case CharCodes.CarriageReturn:
        return true
      default:
        return false
    }
  }
  /**
   * Tells whether a character code terminates a section of a tag (a tag name
   * or an attribute name): `/`, `>` or whitespace.
   *
   * @param c - Character code to test.
   */
  function isEndOfTagSection(c: number): boolean {
    if (c === CharCodes.Slash || c === CharCodes.Gt) {
      return true
    }
    return isWhitespace(c)
  }
  /**
   * Converts a string into a byte array holding one entry per UTF-16 code
   * unit.
   *
   * Because the result is a `Uint8Array`, only the low 8 bits of each code
   * unit are kept; characters above U+00FF do not round-trip.
   *
   * @param str - String to convert.
   * @returns A new array with `str.length` entries.
   */
  export function toCharCodes(str: string): Uint8Array {
    return new Uint8Array(str.length).map((_, position) =>
      str.charCodeAt(position)
    )
  }
  /** How an attribute value was quoted, as reported to `onattribend`. */
  export enum QuoteType {
    /** The attribute has no value at all. */
    NoValue,
    /** The value is not wrapped in quotes. */
    Unquoted,
    /** The value is wrapped in single quotes. */
    Single,
    /** The value is wrapped in double quotes. */
    Double
  }
  /**
   * Handlers the tokenizer invokes as it recognizes pieces of the input.
   * `start` and `endIndex` are indices into the buffer being tokenized.
   */
  export interface Callbacks {
    /** A run of text. */
    ontext(start: number, endIndex: number): void
    ontextentity(codepoint: number, endIndex: number): void
    /** An interpolation, delimiters included. */
    oninterpolation(start: number, endIndex: number): void

    /** The name of an opening tag. */
    onopentagname(start: number, endIndex: number): void
    /** The `>` that ends an opening tag. */
    onopentagend(endIndex: number): void
    /** The `>` of a self-closing tag. */
    onselfclosingtag(endIndex: number): void
    /** The name of a closing tag. */
    onclosetag(start: number, endIndex: number): void

    /** A chunk of attribute value. */
    onattribdata(start: number, endIndex: number): void
    onattribentity(codepoint: number): void
    /** The end of an attribute, along with how its value was quoted. */
    onattribend(quote: QuoteType, endIndex: number): void
    /** A plain attribute name. */
    onattribname(start: number, endIndex: number): void
    /** The end of an attribute or directive name. */
    onattribnameend(endIndex: number): void

    /** A directive name, or a one-character directive shorthand. */
    ondirname(start: number, endIndex: number): void
    /** A directive argument. */
    ondirarg(start: number, endIndex: number): void
    /** A single directive modifier. */
    ondirmodifier(start: number, endIndex: number): void

    /** The body of a comment. */
    oncomment(start: number, endIndex: number): void
    /** The body of a CDATA section. */
    oncdata(start: number, endIndex: number): void

    // onprocessinginstruction(start: number, endIndex: number): void
    // ondeclaration(start: number, endIndex: number): void
    onend(): void
  }
  /**
   * Character-code sequences used to match longer strings.
   *
   * There are no `Script`, `Style`, `Title` or `Textarea` start sequences;
   * the matching `*End` sequence is re-used with an increased offset instead.
   */
  const Sequences = {
    // `CDATA[`
    Cdata: new Uint8Array([0x43, 0x44, 0x41, 0x54, 0x41, 0x5b]),
    // `]]>`
    CdataEnd: new Uint8Array([0x5d, 0x5d, 0x3e]),
    // `-->`
    CommentEnd: new Uint8Array([0x2d, 0x2d, 0x3e]),
    // `</script`
    ScriptEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74]),
    // `</style`
    StyleEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65]),
    // `</title`
    TitleEnd: new Uint8Array([0x3c, 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65]),
    // `</textarea`
    TextareaEnd: new Uint8Array([
      0x3c, 0x2f, 0x74, 0x65, 0x78, 0x74, 0x61, 0x72, 0x65, 0x61
    ])
  }
  184. export default class Tokenizer {
  /** State the tokenizer is currently in. */
  private state: State = State.Text
  /** Input being read. */
  private buffer: string = ''
  /** Index at which the section currently being read begins. */
  private sectionStart: number = 0
  /** Index within the buffer that is currently being looked at. */
  private index: number = 0
  /** Index at which the most recent entity started. */
  private entityStart: number = 0
  /**
   * Some work, e.g. decoding entities, happens while logically inside another
   * state. This remembers that other state so it can be restored.
   */
  private baseState: State = State.Text
  /** Enables the special parsing behavior inside of script and style tags. */
  private isSpecial: boolean = false
  /** Recorded newline positions, for fast line / column calculation. */
  private newlines: number[] = []

  private readonly entityDecoder: EntityDecoder
  /**
   * @param stack - Element stack; an empty stack is treated as "at root
   *   level" when running in SFC mode.
   * @param cbs - Handlers invoked as tokens are recognized.
   */
  constructor(
    private readonly stack: ElementNode[],
    private readonly cbs: Callbacks
  ) {
    // Decoded code points are forwarded to this tokenizer.
    const forwardCodePoint = (cp: number, consumed: number): void => {
      this.emitCodePoint(cp, consumed)
    }
    this.entityDecoder = new EntityDecoder(htmlDecodeTree, forwardCodePoint)
  }

  /** Active parse mode; `reset()` restores it to `ParseMode.BASE`. */
  public mode: ParseMode = ParseMode.BASE
  /**
   * Restores the tokenizer to its initial state so the instance can be
   * re-used for another input. Parse mode and delimiters go back to their
   * defaults and the recorded newlines are dropped.
   */
  public reset(): void {
    this.state = State.Text
    this.mode = ParseMode.BASE
    this.buffer = ''
    this.sectionStart = 0
    this.index = 0
    this.entityStart = 0
    this.baseState = State.Text
    // A previous input may have ended inside a special tag. Without clearing
    // the flag, the next opening tag would enter InSpecialTag while
    // `currentSequence` is unset.
    this.isSpecial = false
    this.currentSequence = undefined!
    this.sequenceIndex = 0
    this.newlines.length = 0
    this.delimiterOpen = defaultDelimitersOpen
    this.delimiterClose = defaultDelimitersClose
  }
  /**
   * Builds a Position (line, column, offset) for a buffer index from the
   * recorded newline positions. The index is expected to have been processed
   * already, so that every newline before it has been recorded.
   *
   * @param index - Offset into the buffer.
   * @returns 1-based line and column plus the original offset.
   */
  public getPos(index: number): Position {
    // Walk back to the closest recorded newline located before `index`.
    let nth = this.newlines.length - 1
    while (nth >= 0 && !(index > this.newlines[nth])) {
      nth--
    }
    if (nth < 0) {
      // No newline precedes the index: it is on the first line.
      return { column: index + 1, line: 1, offset: index }
    }
    return {
      column: index - this.newlines[nth],
      line: nth + 2,
      offset: index
    }
  }
  /**
   * Handles a character while in the Text state. `<` starts a tag, `&` starts
   * an entity, and the opening delimiter starts an interpolation. Pending
   * text is emitted before a tag or an interpolation begins.
   *
   * @param c - Character code at the current index.
   */
  private stateText(c: number): void {
    if (c === CharCodes.Lt) {
      if (this.index > this.sectionStart) {
        this.cbs.ontext(this.sectionStart, this.index)
      }
      this.state = State.BeforeTagName
      this.sectionStart = this.index
    } else if (c === CharCodes.Amp) {
      this.startEntity()
    } else if (this.matchDelimiter(c, this.delimiterOpen)) {
      if (this.index > this.sectionStart) {
        this.cbs.ontext(this.sectionStart, this.index)
      }
      this.state = State.Interpolation
      this.sectionStart = this.index
      // Stop on the LAST character of the opening delimiter. The parse loop
      // advances the index once more after this handler returns, so skipping
      // the full delimiter length would jump over the first character of the
      // interpolation and an empty `{{}}` would never see its closing
      // delimiter.
      this.index += this.delimiterOpen.length - 1
    }
  }
  /** Character codes of the opening interpolation delimiter. */
  public delimiterOpen: Uint8Array = defaultDelimitersOpen
  /** Character codes of the closing interpolation delimiter. */
  public delimiterClose: Uint8Array = defaultDelimitersClose

  /**
   * Checks whether the buffer contains `delimiter` starting at the current
   * index.
   *
   * @param c - Character code at the current index.
   * @param delimiter - Character codes to look for.
   */
  private matchDelimiter(c: number, delimiter: Uint8Array): boolean {
    if (c !== delimiter[0]) {
      return false
    }
    const size = delimiter.length
    for (let offset = 1; offset < size; offset++) {
      const code = this.buffer.charCodeAt(this.index + offset)
      if (code !== delimiter[offset]) {
        return false
      }
    }
    return true
  }
  /**
   * Handles a character inside an interpolation. Once the closing delimiter
   * is found the interpolation is emitted (delimiters included) and the
   * character following it is handed to the Text state.
   *
   * @param c - Character code at the current index.
   */
  private stateInterpolation(c: number): void {
    if (!this.matchDelimiter(c, this.delimiterClose)) {
      return
    }
    this.index += this.delimiterClose.length
    this.cbs.oninterpolation(this.sectionStart, this.index)
    this.state = State.Text
    this.sectionStart = this.index
    // The index now sits just past the delimiter; process that character
    // as text right away.
    const next = this.buffer.charCodeAt(this.index)
    this.stateText(next)
  }
  /** Sequence currently being matched. */
  private currentSequence: Uint8Array = undefined!
  /** Number of characters of `currentSequence` matched so far. */
  private sequenceIndex = 0

  /**
   * Matches the remainder of a special tag's name against the current
   * sequence. On a mismatch the tag is no longer treated as special. When
   * matching stops, the character is processed as part of a regular tag name.
   *
   * @param c - Character code at the current index.
   */
  private stateSpecialStartSequence(c: number): void {
    const atEnd = this.sequenceIndex === this.currentSequence.length

    // Still inside the sequence: compare case-insensitively and keep going.
    if (!atEnd && (c | 0x20) === this.currentSequence[this.sequenceIndex]) {
      this.sequenceIndex++
      return
    }

    // Either a character differed, or the sequence matched completely but
    // the tag name carries on past it.
    if (!atEnd || !isEndOfTagSection(c)) {
      this.isSpecial = false
    }

    this.sequenceIndex = 0
    this.state = State.InTagName
    this.stateInTagName(c)
  }
  /**
   * Scans the content of a special tag for its end tag. Inside `<title>` and
   * `<textarea>` entities are decoded as well.
   *
   * @param c - Character code at the current index.
   */
  private stateInSpecialTag(c: number): void {
    const sequence = this.currentSequence

    if (this.sequenceIndex === sequence.length) {
      // The whole end sequence matched; it only counts if the name ends here.
      if (c === CharCodes.Gt || isWhitespace(c)) {
        const endOfText = this.index - sequence.length
        if (this.sectionStart < endOfText) {
          // Spoof the index so that reported locations match up.
          const actualIndex = this.index
          this.index = endOfText
          this.cbs.ontext(this.sectionStart, endOfText)
          this.index = actualIndex
        }
        this.isSpecial = false
        this.sectionStart = endOfText + 2 // Skip over the `</`
        this.stateInClosingTagName(c)
        return // We are done; skip the rest of the function.
      }
      this.sequenceIndex = 0
    }

    if ((c | 0x20) === sequence[this.sequenceIndex]) {
      this.sequenceIndex += 1
      return
    }

    if (this.sequenceIndex !== 0) {
      // If we see a `<`, set the sequence index to 1; useful for eg. `<</script>`.
      this.sequenceIndex = c === CharCodes.Lt ? 1 : 0
      return
    }

    const decodesEntities =
      sequence === Sequences.TitleEnd ||
      (sequence === Sequences.TextareaEnd &&
        (this.mode !== ParseMode.SFC || this.stack.length !== 0))

    if (decodesEntities) {
      // We have to parse entities in <title> and <textarea> tags.
      if (c === CharCodes.Amp) {
        this.startEntity()
      }
    } else if (this.fastForwardTo(CharCodes.Lt)) {
      // Outside of <title> and <textarea> tags, we can fast-forward.
      this.sequenceIndex = 1
    }
  }
  /**
   * Matches the `CDATA[` sequence. A full match starts a CDATA section; any
   * mismatch turns the construct into a declaration.
   *
   * @param c - Character code at the current index.
   */
  private stateCDATASequence(c: number): void {
    if (c !== Sequences.Cdata[this.sequenceIndex]) {
      this.sequenceIndex = 0
      this.state = State.InDeclaration
      this.stateInDeclaration(c) // Reconsume the character
      return
    }

    this.sequenceIndex++
    if (this.sequenceIndex === Sequences.Cdata.length) {
      this.state = State.InCommentLike
      this.currentSequence = Sequences.CdataEnd
      this.sequenceIndex = 0
      this.sectionStart = this.index + 1
    }
  }
  /**
   * Skips ahead in the buffer to the next occurrence of one specific
   * character, which is quicker than going through the state machine one
   * character at a time. The search starts after the current index.
   *
   * @param c - Character code to look for.
   * @returns Whether the character was found.
   */
  private fastForwardTo(c: number): boolean {
    const end = this.buffer.length
    for (this.index++; this.index < end; this.index++) {
      if (this.buffer.charCodeAt(this.index) === c) {
        return true
      }
    }

    /*
     * We increment the index at the end of the `parse` loop,
     * so set it to `buffer.length - 1` here.
     *
     * TODO: Refactor `parse` to increment index before calling states.
     */
    this.index = end - 1
    return false
  }
  /**
   * Handles the body of a comment or CDATA section, which end with `-->` and
   * `]]>` respectively.
   *
   * What the two have in common:
   * - The end sequence starts with a distinct character.
   * - That character is repeated, so several repeats have to be checked.
   * - Every character other than that first one can be skipped.
   *
   * @param c - Character code at the current index.
   */
  private stateInCommentLike(c: number): void {
    const sequence = this.currentSequence

    if (c === sequence[this.sequenceIndex]) {
      this.sequenceIndex++
      if (this.sequenceIndex !== sequence.length) {
        return
      }
      // Whole end sequence seen: emit the body, leaving the sequence out.
      if (sequence === Sequences.CdataEnd) {
        this.cbs.oncdata(this.sectionStart, this.index - 2)
      } else {
        this.cbs.oncomment(this.sectionStart, this.index - 2)
      }
      this.sequenceIndex = 0
      this.sectionStart = this.index + 1
      this.state = State.Text
      return
    }

    if (this.sequenceIndex === 0) {
      // Fast-forward to the first character of the sequence
      if (this.fastForwardTo(sequence[0])) {
        this.sequenceIndex = 1
      }
      return
    }

    if (c !== sequence[this.sequenceIndex - 1]) {
      // Allow long sequences, eg. --->, ]]]>
      this.sequenceIndex = 0
    }
  }
  /**
   * Starts matching the name of a special tag.
   *
   * @param sequence - End sequence of the tag, re-used to match its name.
   * @param offset - Position within `sequence` at which matching starts.
   */
  private startSpecial(sequence: Uint8Array, offset: number) {
    this.state = State.SpecialStartSequence
    this.currentSequence = sequence
    this.sequenceIndex = offset
    this.isSpecial = true
  }
  /**
   * Handles the character right after `<` and decides what kind of construct
   * follows: declaration, processing instruction, closing tag, opening tag,
   * or plain text after all.
   *
   * @param c - Character code at the current index.
   */
  private stateBeforeTagName(c: number): void {
    if (c === CharCodes.ExclamationMark) {
      this.state = State.BeforeDeclaration
      this.sectionStart = this.index + 1
      return
    }
    if (c === CharCodes.Questionmark) {
      this.state = State.InProcessingInstruction
      this.sectionStart = this.index + 1
      return
    }
    if (c === CharCodes.Slash) {
      this.state = State.BeforeClosingTagName
      return
    }
    if (!isTagStartChar(c)) {
      // Not a tag: treat the character as text.
      this.state = State.Text
      this.stateText(c)
      return
    }

    this.sectionStart = this.index
    if (this.mode === ParseMode.BASE) {
      // no special tags in base mode
      this.state = State.InTagName
      return
    }
    if (this.mode === ParseMode.SFC && this.stack.length === 0) {
      // SFC mode + root level
      // - everything except <template> is RAWTEXT
      // - <template> with lang other than html is also RAWTEXT
      this.state = State.InSFCRootTagName
      return
    }

    // HTML mode
    // - <script>, <style> RAWTEXT
    // - <title>, <textarea> RCDATA
    switch (c | 0x20) {
      case 116 /* t */:
        this.state = State.BeforeSpecialT
        break
      case 115 /* s */:
        this.state = State.BeforeSpecialS
        break
      default:
        this.state = State.InTagName
    }
  }
  /**
   * Consumes tag-name characters until the name ends.
   *
   * @param c - Character code at the current index.
   */
  private stateInTagName(c: number): void {
    if (!isEndOfTagSection(c)) {
      return
    }
    this.handleTagName(c)
  }
  /**
   * Consumes the name of a tag opened at the root level in SFC mode. Every
   * tag other than `template` becomes a special tag whose content is
   * scanned for the matching end tag.
   *
   * @param c - Character code at the current index.
   */
  private stateInSFCRootTagName(c: number): void {
    if (isEndOfTagSection(c)) {
      const tag = this.buffer.slice(this.sectionStart, this.index)
      if (tag !== 'template') {
        this.isSpecial = true
        // The end sequence is compared against `c | 0x20`, i.e. against
        // lower-cased input. It therefore has to be lower case itself, or a
        // tag name containing an upper-case letter could never be closed.
        this.currentSequence = toCharCodes(`</` + tag.toLowerCase())
      }
      this.handleTagName(c)
    }
  }
  /**
   * Emits the opening tag name that just ended and processes the terminating
   * character as the start of the attribute list.
   *
   * @param c - Character code that ended the tag name.
   */
  private handleTagName(c: number) {
    const nameStart = this.sectionStart
    this.cbs.onopentagname(nameStart, this.index)
    this.sectionStart = -1
    this.state = State.BeforeAttributeName
    this.stateBeforeAttributeName(c)
  }
  /**
   * Handles the characters between `</` and the closing tag's name.
   *
   * @param c - Character code at the current index.
   */
  private stateBeforeClosingTagName(c: number): void {
    if (isWhitespace(c)) {
      // Ignore
      return
    }
    if (c === CharCodes.Gt) {
      this.state = State.Text
      return
    }
    if (isTagStartChar(c)) {
      this.state = State.InClosingTagName
    } else {
      this.state = State.InSpecialComment
    }
    this.sectionStart = this.index
  }
  /**
   * Consumes the name of a closing tag and emits it once `>` or whitespace
   * is reached.
   *
   * @param c - Character code at the current index.
   */
  private stateInClosingTagName(c: number): void {
    const nameEnded = c === CharCodes.Gt || isWhitespace(c)
    if (!nameEnded) {
      return
    }
    this.cbs.onclosetag(this.sectionStart, this.index)
    this.sectionStart = -1
    this.state = State.AfterClosingTagName
    this.stateAfterClosingTagName(c)
  }
  /**
   * Skips whatever follows a closing tag's name, up to and including `>`.
   *
   * @param c - Character code at the current index.
   */
  private stateAfterClosingTagName(c: number): void {
    // Skip everything until ">"
    const reachedEnd = c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)
    if (!reachedEnd) {
      return
    }
    this.state = State.Text
    this.sectionStart = this.index + 1
  }
  /**
   * Handles a character inside an opening tag while no attribute is being
   * read: `>` ends the tag, `/` may start a self-closing tag, whitespace is
   * skipped and anything else starts an attribute.
   *
   * @param c - Character code at the current index.
   */
  private stateBeforeAttributeName(c: number): void {
    if (c === CharCodes.Slash) {
      this.state = State.InSelfClosingTag
      return
    }
    if (c !== CharCodes.Gt) {
      if (!isWhitespace(c)) {
        this.handleAttributeStart(c)
      }
      return
    }

    this.cbs.onopentagend(this.index)
    if (this.isSpecial) {
      // The content of a special tag is scanned for its end tag.
      this.state = State.InSpecialTag
      this.sequenceIndex = 0
    } else {
      this.state = State.Text
    }
    this.sectionStart = this.index + 1
  }
  /**
   * Classifies the first character of an attribute: a `v-` directive, a
   * one-character directive shorthand (`.`, `:`, `@` or `#`), or a plain
   * attribute name.
   *
   * @param c - First character code of the attribute.
   */
  private handleAttributeStart(c: number) {
    const isVDash =
      c === CharCodes.LowerV &&
      this.buffer.charCodeAt(this.index + 1) === CharCodes.Dash
    if (isVDash) {
      this.state = State.InDirectiveName
      this.sectionStart = this.index
      return
    }

    switch (c) {
      case CharCodes.Dot:
      case CharCodes.Colon:
      case CharCodes.At:
      case CharCodes.Number:
        // The shorthand character itself is reported as the directive name;
        // the argument starts right after it.
        this.cbs.ondirname(this.index, this.index + 1)
        this.state = State.InDirectiveArg
        this.sectionStart = this.index + 1
        return
    }

    this.state = State.InAttributeName
    this.sectionStart = this.index
  }
  /**
   * Handles the character after a `/` inside an opening tag.
   *
   * @param c - Character code at the current index.
   */
  private stateInSelfClosingTag(c: number): void {
    if (c === CharCodes.Gt) {
      this.cbs.onselfclosingtag(this.index)
      this.state = State.Text
      this.sectionStart = this.index + 1
      this.isSpecial = false // Reset special state, in case of self-closing special tags
      return
    }
    if (isWhitespace(c)) {
      return
    }
    // Not a self-closing tag after all: go back to reading attributes.
    this.state = State.BeforeAttributeName
    this.stateBeforeAttributeName(c)
  }
  /**
   * Consumes a plain attribute name and emits it when `=` or the end of the
   * tag section is reached.
   *
   * @param c - Character code at the current index.
   */
  private stateInAttributeName(c: number): void {
    const nameEnded = c === CharCodes.Eq || isEndOfTagSection(c)
    if (!nameEnded) {
      return
    }
    this.cbs.onattribname(this.sectionStart, this.index)
    this.handleAttributeNameEnd(c)
  }
  /**
   * Consumes a directive name. The name ends at `=` or the end of the tag
   * section, at `:` (an argument follows) or at `.` (a modifier follows).
   *
   * @param c - Character code at the current index.
   */
  private stateInDirectiveName(c: number): void {
    if (c === CharCodes.Eq || isEndOfTagSection(c)) {
      this.cbs.ondirname(this.sectionStart, this.index)
      this.handleAttributeNameEnd(c)
      return
    }
    if (c !== CharCodes.Colon && c !== CharCodes.Dot) {
      return
    }
    this.cbs.ondirname(this.sectionStart, this.index)
    this.state =
      c === CharCodes.Colon ? State.InDirectiveArg : State.InDirectiveModifier
    this.sectionStart = this.index + 1
  }
  /**
   * Consumes a directive argument. `[` switches to the dynamic-argument
   * state and `.` starts a modifier.
   *
   * @param c - Character code at the current index.
   */
  private stateInDirectiveArg(c: number): void {
    if (c === CharCodes.Eq || isEndOfTagSection(c)) {
      this.cbs.ondirarg(this.sectionStart, this.index)
      this.handleAttributeNameEnd(c)
      return
    }
    if (c === CharCodes.LeftSqaure) {
      this.state = State.InDirectiveDynamicArg
      return
    }
    if (c === CharCodes.Dot) {
      this.cbs.ondirarg(this.sectionStart, this.index)
      this.state = State.InDirectiveModifier
      this.sectionStart = this.index + 1
    }
  }
  /**
   * Consumes the inside of a dynamic directive argument; `]` returns to the
   * regular argument state.
   *
   * @param c - Character code at the current index.
   */
  private stateInDynamicDirectiveArg(c: number): void {
    if (c === CharCodes.RightSquare) {
      this.state = State.InDirectiveArg
      return
    }
    // TODO emit error when `=` or the end of the tag section shows up before
    // the closing `]`. Nothing is done for that case at the moment.
  }
  /**
   * Consumes a directive modifier. Each `.` ends one modifier and starts the
   * next; `=` or the end of the tag section ends the whole name.
   *
   * @param c - Character code at the current index.
   */
  private stateInDirectiveModifier(c: number): void {
    const nameEnded = c === CharCodes.Eq || isEndOfTagSection(c)
    if (!nameEnded && c !== CharCodes.Dot) {
      return
    }
    this.cbs.ondirmodifier(this.sectionStart, this.index)
    if (nameEnded) {
      this.handleAttributeNameEnd(c)
    } else {
      this.sectionStart = this.index + 1
    }
  }
  /**
   * Shared tail of every attribute/directive name state: reports the end of
   * the name and processes the terminating character in the
   * AfterAttributeName state.
   *
   * @param c - Character code that ended the name.
   */
  private handleAttributeNameEnd(c: number): void {
    const nameEnd = this.index
    this.sectionStart = nameEnd
    this.state = State.AfterAttributeName
    this.cbs.onattribnameend(nameEnd)
    this.stateAfterAttributeName(c)
  }
  /**
   * Handles a character after an attribute name. `=` announces a value;
   * anything but whitespace ends the attribute without one.
   *
   * @param c - Character code at the current index.
   */
  private stateAfterAttributeName(c: number): void {
    if (c === CharCodes.Eq) {
      this.state = State.BeforeAttributeValue
      return
    }
    if (isWhitespace(c)) {
      return
    }

    // The attribute has no value.
    this.cbs.onattribend(QuoteType.NoValue, this.sectionStart)
    if (c === CharCodes.Slash || c === CharCodes.Gt) {
      this.sectionStart = -1
      this.state = State.BeforeAttributeName
      this.stateBeforeAttributeName(c)
    } else {
      // Another attribute starts right here.
      this.handleAttributeStart(c)
    }
  }
  /**
   * Handles a character after `=`: picks the quoted or unquoted value state
   * and skips leading whitespace.
   *
   * @param c - Character code at the current index.
   */
  private stateBeforeAttributeValue(c: number): void {
    switch (c) {
      case CharCodes.DoubleQuote:
        this.state = State.InAttributeValueDq
        this.sectionStart = this.index + 1
        return
      case CharCodes.SingleQuote:
        this.state = State.InAttributeValueSq
        this.sectionStart = this.index + 1
        return
    }
    if (isWhitespace(c)) {
      return
    }
    this.sectionStart = this.index
    this.state = State.InAttributeValueNq
    this.stateInAttributeValueNoQuotes(c) // Reconsume token
  }
  /**
   * Shared handler for quoted attribute values. The closing quote emits the
   * value and ends the attribute; `&` starts an entity.
   *
   * @param c - Character code at the current index.
   * @param quote - Character code of the quote that delimits the value.
   */
  private handleInAttributeValue(c: number, quote: number) {
    if (c !== quote) {
      if (c === CharCodes.Amp) {
        this.startEntity()
      }
      return
    }

    this.cbs.onattribdata(this.sectionStart, this.index)
    this.sectionStart = -1
    const quoteType =
      quote === CharCodes.DoubleQuote ? QuoteType.Double : QuoteType.Single
    // The attribute ends after the closing quote.
    this.cbs.onattribend(quoteType, this.index + 1)
    this.state = State.BeforeAttributeName
  }
  /**
   * Handles a character inside a double-quoted attribute value.
   *
   * @param c - Character code at the current index.
   */
  private stateInAttributeValueDoubleQuotes(c: number): void {
    const closingQuote = CharCodes.DoubleQuote
    this.handleInAttributeValue(c, closingQuote)
  }
  /**
   * Handles a character inside a single-quoted attribute value.
   *
   * @param c - Character code at the current index.
   */
  private stateInAttributeValueSingleQuotes(c: number): void {
    const closingQuote = CharCodes.SingleQuote
    this.handleInAttributeValue(c, closingQuote)
  }
  /**
   * Handles a character inside an unquoted attribute value. Whitespace or
   * `>` ends the value; `&` starts an entity.
   *
   * @param c - Character code at the current index.
   */
  private stateInAttributeValueNoQuotes(c: number): void {
    const valueEnded = isWhitespace(c) || c === CharCodes.Gt
    if (valueEnded) {
      this.cbs.onattribdata(this.sectionStart, this.index)
      this.sectionStart = -1
      this.cbs.onattribend(QuoteType.Unquoted, this.index)
      this.state = State.BeforeAttributeName
      this.stateBeforeAttributeName(c)
      return
    }
    if (c === CharCodes.Amp) {
      this.startEntity()
    }
  }
  /**
   * Handles the character after `<!`: `[` may start a CDATA section, `-` may
   * start a comment, and anything else is a declaration.
   *
   * @param c - Character code at the current index.
   */
  private stateBeforeDeclaration(c: number): void {
    if (c === CharCodes.OpeningSquareBracket) {
      this.state = State.CDATASequence
      this.sequenceIndex = 0
      return
    }
    if (c === CharCodes.Dash) {
      this.state = State.BeforeComment
    } else {
      this.state = State.InDeclaration
    }
  }
  /**
   * Skips a declaration up to and including its closing `>`. Nothing is
   * emitted for it.
   *
   * @param c - Character code at the current index.
   */
  private stateInDeclaration(c: number): void {
    const reachedEnd = c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)
    if (!reachedEnd) {
      return
    }
    // this.cbs.ondeclaration(this.sectionStart, this.index)
    this.state = State.Text
    this.sectionStart = this.index + 1
  }
  /**
   * Skips a processing instruction up to and including its closing `>`.
   * Nothing is emitted for it.
   *
   * @param c - Character code at the current index.
   */
  private stateInProcessingInstruction(c: number): void {
    const reachedEnd = c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)
    if (!reachedEnd) {
      return
    }
    // this.cbs.onprocessinginstruction(this.sectionStart, this.index)
    this.state = State.Text
    this.sectionStart = this.index + 1
  }
  /**
   * Handles the character after `<!-`: a second `-` starts a comment,
   * anything else makes the construct a declaration.
   *
   * @param c - Character code at the current index.
   */
  private stateBeforeComment(c: number): void {
    if (c !== CharCodes.Dash) {
      this.state = State.InDeclaration
      return
    }
    this.state = State.InCommentLike
    this.currentSequence = Sequences.CommentEnd
    // Allow short comments (eg. <!-->)
    this.sequenceIndex = 2
    this.sectionStart = this.index + 1
  }
  /**
   * Consumes a special comment, which runs up to the next `>`, and emits it
   * as a comment.
   *
   * @param c - Character code at the current index.
   */
  private stateInSpecialComment(c: number): void {
    const reachedEnd = c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)
    if (!reachedEnd) {
      return
    }
    this.cbs.oncomment(this.sectionStart, this.index)
    this.state = State.Text
    this.sectionStart = this.index + 1
  }
  /**
   * Looks at the second character of a tag starting with `s` to tell
   * `<script` and `<style` apart from ordinary tags.
   *
   * @param c - Character code at the current index.
   */
  private stateBeforeSpecialS(c: number): void {
    const lower = c | 0x20
    let endSequence: Uint8Array | undefined
    if (lower === Sequences.ScriptEnd[3]) {
      endSequence = Sequences.ScriptEnd
    } else if (lower === Sequences.StyleEnd[3]) {
      endSequence = Sequences.StyleEnd
    }

    if (endSequence) {
      // Entries 0-3 (`</` plus two name characters) are accounted for.
      this.startSpecial(endSequence, 4)
    } else {
      this.state = State.InTagName
      this.stateInTagName(c) // Consume the token again
    }
  }
  /**
   * Looks at the second character of a tag starting with `t` to tell
   * `<title` and `<textarea` apart from ordinary tags.
   *
   * @param c - Character code at the current index.
   */
  private stateBeforeSpecialT(c: number): void {
    const lower = c | 0x20
    let endSequence: Uint8Array | undefined
    if (lower === Sequences.TitleEnd[3]) {
      endSequence = Sequences.TitleEnd
    } else if (lower === Sequences.TextareaEnd[3]) {
      endSequence = Sequences.TextareaEnd
    }

    if (endSequence) {
      // Entries 0-3 (`</` plus two name characters) are accounted for.
      this.startSpecial(endSequence, 4)
    } else {
      this.state = State.InTagName
      this.stateInTagName(c) // Consume the token again
    }
  }
  /**
   * Switches to entity decoding. The current state is remembered so it can
   * be restored afterwards. The decoder runs in legacy mode for text and
   * special-tag content, and in attribute mode otherwise.
   */
  private startEntity() {
    const returnTo = this.state
    this.baseState = returnTo
    this.state = State.InEntity
    this.entityStart = this.index

    const inTextContent =
      returnTo === State.Text || returnTo === State.InSpecialTag
    this.entityDecoder.startEntity(
      inTextContent ? DecodingMode.Legacy : DecodingMode.Attribute
    )
  }
  /**
   * Feeds the buffer, from the current index on, to the entity decoder and
   * acts on the length it reports.
   */
  private stateInEntity(): void {
    const length = this.entityDecoder.write(this.buffer, this.index)

    if (!(length >= 0)) {
      // Mark buffer as consumed.
      this.index = this.buffer.length - 1
      return
    }

    // A non-negative `length` means we are done with the entity.
    this.state = this.baseState
    if (length === 0) {
      // Resume at the position where the entity started.
      this.index = this.entityStart
    }
  }
  /**
   * Iterates through the buffer, calling the function corresponding to the current state.
   *
   * States that are more likely to be hit are higher up, as a performance improvement.
   *
   * The offset of every newline character visited by the loop is appended to
   * `this.newlines`. Once the loop is done, pending data is flushed through
   * `cleanup()` and tokenization is finalised through `finish()`.
   *
   * @param input - The complete source text to tokenize.
   */
  public parse(input: string) {
    this.buffer = input
    while (this.index < this.buffer.length) {
      const c = this.buffer.charCodeAt(this.index)
      /*
       * Record the newline *before* dispatching: state handlers are allowed
       * to move `this.index` (e.g. `stateInEntity` rewinds it to the entity
       * start or jumps to the end of the buffer, `emitCodePoint` repositions
       * it), so reading `this.index` after the handler ran could store an
       * offset that is not the newline's.
       *
       * Newlines seen while in `InEntity` are skipped here: a newline is not
       * expected to be part of a character reference, so the handler rewinds
       * and the character is visited again (and recorded) in the base state.
       * Recording it here as well would produce a duplicate entry.
       */
      if (c === CharCodes.NewLine && this.state !== State.InEntity) {
        this.newlines.push(this.index)
      }
      switch (this.state) {
        case State.Text: {
          this.stateText(c)
          break
        }
        case State.Interpolation: {
          this.stateInterpolation(c)
          break
        }
        case State.SpecialStartSequence: {
          this.stateSpecialStartSequence(c)
          break
        }
        case State.InSpecialTag: {
          this.stateInSpecialTag(c)
          break
        }
        case State.CDATASequence: {
          this.stateCDATASequence(c)
          break
        }
        case State.InAttributeValueDq: {
          this.stateInAttributeValueDoubleQuotes(c)
          break
        }
        case State.InAttributeName: {
          this.stateInAttributeName(c)
          break
        }
        case State.InDirectiveName: {
          this.stateInDirectiveName(c)
          break
        }
        case State.InDirectiveArg: {
          this.stateInDirectiveArg(c)
          break
        }
        case State.InDirectiveDynamicArg: {
          this.stateInDynamicDirectiveArg(c)
          break
        }
        case State.InDirectiveModifier: {
          this.stateInDirectiveModifier(c)
          break
        }
        case State.InCommentLike: {
          this.stateInCommentLike(c)
          break
        }
        case State.InSpecialComment: {
          this.stateInSpecialComment(c)
          break
        }
        case State.BeforeAttributeName: {
          this.stateBeforeAttributeName(c)
          break
        }
        case State.InTagName: {
          this.stateInTagName(c)
          break
        }
        case State.InSFCRootTagName: {
          this.stateInSFCRootTagName(c)
          break
        }
        case State.InClosingTagName: {
          this.stateInClosingTagName(c)
          break
        }
        case State.BeforeTagName: {
          this.stateBeforeTagName(c)
          break
        }
        case State.AfterAttributeName: {
          this.stateAfterAttributeName(c)
          break
        }
        case State.InAttributeValueSq: {
          this.stateInAttributeValueSingleQuotes(c)
          break
        }
        case State.BeforeAttributeValue: {
          this.stateBeforeAttributeValue(c)
          break
        }
        case State.BeforeClosingTagName: {
          this.stateBeforeClosingTagName(c)
          break
        }
        case State.AfterClosingTagName: {
          this.stateAfterClosingTagName(c)
          break
        }
        case State.BeforeSpecialS: {
          this.stateBeforeSpecialS(c)
          break
        }
        case State.BeforeSpecialT: {
          this.stateBeforeSpecialT(c)
          break
        }
        case State.InAttributeValueNq: {
          this.stateInAttributeValueNoQuotes(c)
          break
        }
        case State.InSelfClosingTag: {
          this.stateInSelfClosingTag(c)
          break
        }
        case State.InDeclaration: {
          this.stateInDeclaration(c)
          break
        }
        case State.BeforeDeclaration: {
          this.stateBeforeDeclaration(c)
          break
        }
        case State.BeforeComment: {
          this.stateBeforeComment(c)
          break
        }
        case State.InProcessingInstruction: {
          this.stateInProcessingInstruction(c)
          break
        }
        case State.InEntity: {
          this.stateInEntity()
          break
        }
      }
      this.index++
    }
    this.cleanup()
    this.finish()
  }
  /**
   * Flushes the section that has been scanned but not yet reported.
   *
   * When the tokenizer stopped inside text (or inside a special tag while no
   * end sequence is being matched) the pending range is emitted through
   * `ontext`; when it stopped inside an attribute value it is emitted through
   * `onattribdata`. In both cases the section start is then moved up to the
   * current index. Any other state leaves everything untouched.
   */
  private cleanup() {
    // Nothing pending between the section start and the cursor.
    if (this.sectionStart === this.index) {
      return
    }

    const current = this.state
    const insideText =
      current === State.Text ||
      (current === State.InSpecialTag && this.sequenceIndex === 0)
    const insideAttributeValue =
      current === State.InAttributeValueDq ||
      current === State.InAttributeValueSq ||
      current === State.InAttributeValueNq

    if (insideText) {
      this.cbs.ontext(this.sectionStart, this.index)
    } else if (insideAttributeValue) {
      this.cbs.onattribdata(this.sectionStart, this.index)
    } else {
      return
    }

    // The emitted range is done with; the next section begins at the cursor.
    this.sectionStart = this.index
  }
  /**
   * Wraps up tokenization: closes an entity that is still being decoded,
   * reports any trailing data and finally fires the `onend` callback.
   */
  private finish() {
    const stoppedInsideEntity = this.state === State.InEntity
    if (stoppedInsideEntity) {
      // Signal the end of input to the decoder, then resume the state that
      // was active before the entity started.
      this.entityDecoder.end()
      this.state = this.baseState
    }

    this.handleTrailingData()
    this.cbs.onend()
  }
  /**
   * Reports whatever is left between the section start and the end of the
   * buffer, depending on the state the tokenizer ended in:
   *
   * - inside a comment-like construct: emitted as CDATA when the sequence
   *   being searched for is the CDATA end sequence, as a comment otherwise;
   * - inside an opening or closing tag: nothing is emitted;
   * - anything else: emitted as text.
   */
  private handleTrailingData() {
    const bufferEnd = this.buffer.length

    // The section start already sits at (or past) the end: nothing remains.
    if (this.sectionStart >= bufferEnd) {
      return
    }

    switch (this.state) {
      case State.InCommentLike: {
        if (this.currentSequence === Sequences.CdataEnd) {
          this.cbs.oncdata(this.sectionStart, bufferEnd)
        } else {
          this.cbs.oncomment(this.sectionStart, bufferEnd)
        }
        break
      }
      case State.InTagName:
      case State.BeforeAttributeName:
      case State.BeforeAttributeValue:
      case State.AfterAttributeName:
      case State.InAttributeName:
      case State.InDirectiveName:
      case State.InDirectiveArg:
      case State.InDirectiveDynamicArg:
      case State.InDirectiveModifier:
      case State.InAttributeValueSq:
      case State.InAttributeValueDq:
      case State.InAttributeValueNq:
      case State.InClosingTagName: {
        /*
         * The input ended in the middle of an opening or closing tag. Not
         * invoking any callback signals that the tag should be ignored.
         */
        break
      }
      default: {
        this.cbs.ontext(this.sectionStart, bufferEnd)
      }
    }
  }
  /**
   * Reports a decoded entity to the callbacks.
   *
   * Data that precedes the entity in the current section is flushed first,
   * then the section start is moved just past the entity and the index is
   * placed on the entity's last character. The entity is reported as a text
   * entity when the base state is text or a special tag, and as an attribute
   * entity otherwise.
   *
   * @param cp - The decoded code point.
   * @param consumed - Number of characters the entity occupies, counted from
   *   the entity start.
   */
  private emitCodePoint(cp: number, consumed: number): void {
    const inTextContent =
      this.baseState === State.Text || this.baseState === State.InSpecialTag

    // Flush whatever sits in front of the entity.
    if (this.sectionStart < this.entityStart) {
      if (inTextContent) {
        this.cbs.ontext(this.sectionStart, this.entityStart)
      } else {
        this.cbs.onattribdata(this.sectionStart, this.entityStart)
      }
    }

    const afterEntity = this.entityStart + consumed
    this.sectionStart = afterEntity
    this.index = afterEntity - 1

    if (inTextContent) {
      this.cbs.ontextentity(cp, this.sectionStart)
    } else {
      this.cbs.onattribentity(cp)
    }
  }
  978. }