Tokenizer.ts 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974
  /**
   * This Tokenizer is adapted from htmlparser2 under the MIT License listed at
   * https://github.com/fb55/htmlparser2/blob/master/LICENSE
   *
   * Copyright 2010, 2011, Chris Winberry <chris@winberry.net>. All rights reserved.
   * Permission is hereby granted, free of charge, to any person obtaining a copy
   * of this software and associated documentation files (the "Software"), to
   * deal in the Software without restriction, including without limitation the
   * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
   * sell copies of the Software, and to permit persons to whom the Software is
   * furnished to do so, subject to the following conditions:
   *
   * The above copyright notice and this permission notice shall be included in
   * all copies or substantial portions of the Software.
   *
   * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
   * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
   * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
   * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
   * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
   * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
   * IN THE SOFTWARE.
   */
  21. import {
  22. EntityDecoder,
  23. DecodingMode,
  24. htmlDecodeTree
  25. } from 'entities/lib/decode.js'
  26. import { Position } from '../ast'
  /**
   * Character codes (UTF-16 code units) the tokenizer compares against.
   * Declared as a `const enum` so the values are inlined at compile time.
   */
  export const enum CharCodes {
    Tab = 0x9, // "\t"
    NewLine = 0xa, // "\n"
    FormFeed = 0xc, // "\f"
    CarriageReturn = 0xd, // "\r"
    Space = 0x20, // " "
    ExclamationMark = 0x21, // "!"
    Number = 0x23, // "#"
    Amp = 0x26, // "&"
    SingleQuote = 0x27, // "'"
    DoubleQuote = 0x22, // '"'
    Dash = 0x2d, // "-"
    Slash = 0x2f, // "/"
    Zero = 0x30, // "0"
    Nine = 0x39, // "9"
    Semi = 0x3b, // ";"
    Lt = 0x3c, // "<"
    Eq = 0x3d, // "="
    Gt = 0x3e, // ">"
    Questionmark = 0x3f, // "?"
    UpperA = 0x41, // "A"
    LowerA = 0x61, // "a"
    UpperF = 0x46, // "F"
    LowerF = 0x66, // "f"
    UpperZ = 0x5a, // "Z"
    LowerZ = 0x7a, // "z"
    LowerX = 0x78, // "x"
    OpeningSquareBracket = 0x5b, // "["
    LowerV = 0x76, // "v"
    Dot = 0x2e, // "."
    Colon = 0x3a, // ":"
    At = 0x40, // "@"
    /** @deprecated Misspelled; kept for existing callers. Use `LeftSquare`. */
    LeftSqaure = 91, // "["
    LeftSquare = 91, // "["
    RightSquare = 93 // "]"
  }
  /** Default interpolation delimiters, stored as character codes. */
  const defaultDelimitersOpen = new Uint8Array([123, 123]) // "{{"
  const defaultDelimitersClose = new Uint8Array([125, 125]) // "}}"
  /** All the states the tokenizer can be in. */
  const enum State {
    Text = 1,
    Interpolation,

    // Tags
    BeforeTagName, // After <
    InTagName,
    InSelfClosingTag,
    BeforeClosingTagName,
    InClosingTagName,
    AfterClosingTagName,

    // Attributes
    BeforeAttributeName,
    InAttributeName,
    InDirectiveName,
    InDirectiveArg,
    InDirectiveDynamicArg,
    InDirectiveModifier,
    AfterAttributeName,
    BeforeAttributeValue,
    InAttributeValueDq, // "
    InAttributeValueSq, // '
    InAttributeValueNq,

    // Declarations
    BeforeDeclaration, // !
    InDeclaration,

    // Processing instructions
    InProcessingInstruction, // ?

    // Comments & CDATA
    BeforeComment,
    CDATASequence,
    InSpecialComment,
    InCommentLike,

    // Special tags
    BeforeSpecialS, // Decide if we deal with `<script` or `<style`
    SpecialStartSequence,
    InSpecialTag,

    InEntity
  }
  /**
   * HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a
   * tag name.
   *
   * @param c - Character code to test.
   * @returns `true` when `c` is an ASCII letter.
   */
  function isTagStartChar(c: number): boolean {
    return (
      (c >= CharCodes.LowerA && c <= CharCodes.LowerZ) ||
      (c >= CharCodes.UpperA && c <= CharCodes.UpperZ)
    )
  }
  /**
   * Tests whether a character code is one of the whitespace characters the
   * tokenizer recognizes: space, line feed, tab, form feed or carriage return.
   *
   * @param c - Character code to test.
   */
  export function isWhitespace(c: number): boolean {
    return (
      c === CharCodes.Space ||
      c === CharCodes.NewLine ||
      c === CharCodes.Tab ||
      c === CharCodes.FormFeed ||
      c === CharCodes.CarriageReturn
    )
  }
  /**
   * Tests whether a character code terminates the current part of a tag
   * (tag name, attribute name, ...): `/`, `>` or whitespace.
   *
   * @param c - Character code to test.
   */
  function isEndOfTagSection(c: number): boolean {
    return c === CharCodes.Slash || c === CharCodes.Gt || isWhitespace(c)
  }
  /** How an attribute value was quoted; reported through `onattribend`. */
  export enum QuoteType {
    NoValue = 0,
    Unquoted = 1,
    Single = 2,
    Double = 3
  }
  /**
   * Event sink for the tokenizer. `start` / `endIndex` arguments are offsets
   * into the string passed to `Tokenizer.parse`; the tokenizer never hands out
   * substrings itself.
   */
  export interface Callbacks {
    ontext(start: number, endIndex: number): void
    ontextentity(codepoint: number, endIndex: number): void

    oninterpolation(start: number, endIndex: number): void

    onopentagname(start: number, endIndex: number): void
    onopentagend(endIndex: number): void
    onselfclosingtag(endIndex: number): void
    onclosetag(start: number, endIndex: number): void

    onattribdata(start: number, endIndex: number): void
    onattribentity(codepoint: number): void
    onattribend(quote: QuoteType, endIndex: number): void
    onattribname(start: number, endIndex: number): void
    onattribnameend(endIndex: number): void

    ondirname(start: number, endIndex: number): void
    ondirarg(start: number, endIndex: number): void
    ondirmodifier(start: number, endIndex: number): void

    oncomment(start: number, endIndex: number): void
    oncdata(start: number, endIndex: number): void

    // onprocessinginstruction(start: number, endIndex: number): void
    // ondeclaration(start: number, endIndex: number): void
    onend(): void
  }
  /**
   * Sequences used to match longer strings.
   *
   * We don't have `Script`, `Style`, or `Title` here. Instead, we re-use the *End
   * sequences with an increased offset.
   */
  const Sequences = {
    Cdata: new Uint8Array([0x43, 0x44, 0x41, 0x54, 0x41, 0x5b]), // CDATA[
    CdataEnd: new Uint8Array([0x5d, 0x5d, 0x3e]), // ]]>
    CommentEnd: new Uint8Array([0x2d, 0x2d, 0x3e]), // `-->`
    ScriptEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74]), // `</script`
    StyleEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65]), // `</style`
    TitleEnd: new Uint8Array([0x3c, 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65]) // `</title`
  }
  /**
   * Character-by-character state machine over a template string. It never
   * slices the input; instead it reports token boundaries as buffer offsets
   * through the `Callbacks` object given to the constructor.
   *
   * Usage: construct once, call `parse(input)`; call `reset()` before parsing
   * another input with the same instance.
   */
  export default class Tokenizer {
    /** The current state the tokenizer is in. */
    private state = State.Text
    /** The read buffer. */
    private buffer = ''
    /** The beginning of the section that is currently being read. */
    private sectionStart = 0
    /** The index within the buffer that we are currently looking at. */
    private index = 0
    /** The start of the last entity. */
    private entityStart = 0
    /** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
    private baseState = State.Text
    /** For special parsing behavior inside of script and style tags. */
    private isSpecial = false
    /** Record newline positions for fast line / column calculation */
    private newlines: number[] = []

    private readonly entityDecoder: EntityDecoder

    /** Character codes of the interpolation opening delimiter. */
    public delimiterOpen: Uint8Array = defaultDelimitersOpen
    /** Character codes of the interpolation closing delimiter. */
    public delimiterClose: Uint8Array = defaultDelimitersClose

    /** The sequence currently being matched (one of `Sequences`). */
    private currentSequence: Uint8Array = undefined!
    /** How many characters of `currentSequence` have been matched so far. */
    private sequenceIndex = 0

    constructor(private readonly cbs: Callbacks) {
      this.entityDecoder = new EntityDecoder(htmlDecodeTree, (cp, consumed) =>
        this.emitCodePoint(cp, consumed)
      )
    }

    /**
     * Restore the tokenizer to its initial state so the instance can be reused
     * for another `parse` call. Also restores the default delimiters.
     */
    public reset(): void {
      this.state = State.Text
      this.buffer = ''
      this.sectionStart = 0
      this.index = 0
      this.entityStart = 0
      this.baseState = State.Text
      // A previous parse may have ended inside an unterminated special tag;
      // a stale flag would send the next parse into `InSpecialTag` with no
      // `currentSequence` set.
      this.isSpecial = false
      this.currentSequence = undefined!
      this.sequenceIndex = 0
      this.newlines.length = 0
      this.delimiterOpen = defaultDelimitersOpen
      this.delimiterClose = defaultDelimitersClose
    }

    /**
     * Generate Position object with line / column information using recorded
     * newline positions. We know the index is always going to be an already
     * processed index, so all the newlines up to this index should have been
     * recorded.
     *
     * @param index - Offset into the buffer.
     * @returns 1-based line and column plus the original offset.
     */
    public getPos(index: number): Position {
      let line = 1
      let column = index + 1
      // Walk backwards to the last newline located before `index`.
      for (let i = this.newlines.length - 1; i >= 0; i--) {
        const newlineIndex = this.newlines[i]
        if (index > newlineIndex) {
          line = i + 2
          column = index - newlineIndex
          break
        }
      }
      return {
        column,
        line,
        offset: index
      }
    }

    /**
     * Remember the offset of a newline. Offsets are kept strictly increasing,
     * so a character that gets visited twice (eg. after an entity lookup that
     * rewinds `index`) is only recorded once.
     */
    private recordNewline(index: number): void {
      const count = this.newlines.length
      if (count === 0 || this.newlines[count - 1] < index) {
        this.newlines.push(index)
      }
    }

    /** Plain text: watch for `<`, `&` and the interpolation opening delimiter. */
    private stateText(c: number): void {
      if (c === CharCodes.Lt) {
        if (this.index > this.sectionStart) {
          this.cbs.ontext(this.sectionStart, this.index)
        }
        this.state = State.BeforeTagName
        this.sectionStart = this.index
      } else if (c === CharCodes.Amp) {
        this.startEntity()
      } else if (this.matchDelimiter(c, this.delimiterOpen)) {
        if (this.index > this.sectionStart) {
          this.cbs.ontext(this.sectionStart, this.index)
        }
        this.state = State.Interpolation
        this.sectionStart = this.index
        // Stop on the last delimiter character: the `parse` loop increments
        // `index` afterwards, landing on the first character of the expression.
        this.index += this.delimiterOpen.length - 1
      }
    }

    /**
     * Check whether `delimiter` starts at the current index.
     *
     * @param c - Character code at the current index.
     * @param delimiter - Character codes of the delimiter to match.
     */
    private matchDelimiter(c: number, delimiter: Uint8Array): boolean {
      if (c === delimiter[0]) {
        const l = delimiter.length
        for (let i = 1; i < l; i++) {
          if (this.buffer.charCodeAt(this.index + i) !== delimiter[i]) {
            return false
          }
        }
        return true
      }
      return false
    }

    /** Inside an interpolation: wait for the closing delimiter. */
    private stateInterpolation(c: number): void {
      if (this.matchDelimiter(c, this.delimiterClose)) {
        const end = this.index + this.delimiterClose.length
        this.cbs.oninterpolation(this.sectionStart, end)
        this.state = State.Text
        this.sectionStart = end
        // Stop on the last delimiter character; the `parse` loop's increment
        // then hands the following character to `stateText` as usual.
        this.index = end - 1
      }
    }

    /** Match the remainder of a `script` / `style` / `title` opening tag name. */
    private stateSpecialStartSequence(c: number): void {
      const isEnd = this.sequenceIndex === this.currentSequence.length
      const isMatch = isEnd
        ? // If we are at the end of the sequence, make sure the tag name has ended
          isEndOfTagSection(c)
        : // Otherwise, do a case-insensitive comparison
          (c | 0x20) === this.currentSequence[this.sequenceIndex]

      if (!isMatch) {
        this.isSpecial = false
      } else if (!isEnd) {
        this.sequenceIndex++
        return
      }

      this.sequenceIndex = 0
      this.state = State.InTagName
      this.stateInTagName(c)
    }

    /** Look for an end tag. For <title> tags, also decode entities. */
    private stateInSpecialTag(c: number): void {
      if (this.sequenceIndex === this.currentSequence.length) {
        if (c === CharCodes.Gt || isWhitespace(c)) {
          const endOfText = this.index - this.currentSequence.length

          if (this.sectionStart < endOfText) {
            // Spoof the index so that reported locations match up.
            const actualIndex = this.index
            this.index = endOfText
            this.cbs.ontext(this.sectionStart, endOfText)
            this.index = actualIndex
          }

          this.isSpecial = false
          this.sectionStart = endOfText + 2 // Skip over the `</`
          this.stateInClosingTagName(c)
          return // We are done; skip the rest of the function.
        }

        this.sequenceIndex = 0
      }

      if ((c | 0x20) === this.currentSequence[this.sequenceIndex]) {
        this.sequenceIndex += 1
      } else if (this.sequenceIndex === 0) {
        if (this.currentSequence === Sequences.TitleEnd) {
          // We have to parse entities in <title> tags.
          if (c === CharCodes.Amp) {
            this.startEntity()
          }
        } else if (this.fastForwardTo(CharCodes.Lt)) {
          // Outside of <title> tags, we can fast-forward.
          this.sequenceIndex = 1
        }
      } else {
        // If we see a `<`, set the sequence index to 1; useful for eg. `<</script>`.
        this.sequenceIndex = Number(c === CharCodes.Lt)
      }
    }

    /** After `<![`: match `CDATA[`, otherwise treat the input as a declaration. */
    private stateCDATASequence(c: number): void {
      if (c === Sequences.Cdata[this.sequenceIndex]) {
        if (++this.sequenceIndex === Sequences.Cdata.length) {
          this.state = State.InCommentLike
          this.currentSequence = Sequences.CdataEnd
          this.sequenceIndex = 0
          this.sectionStart = this.index + 1
        }
      } else {
        this.sequenceIndex = 0
        this.state = State.InDeclaration
        this.stateInDeclaration(c) // Reconsume the character
      }
    }

    /**
     * When we wait for one specific character, we can speed things up
     * by skipping through the buffer until we find it.
     *
     * Newlines passed on the way are recorded so that `getPos` stays accurate
     * for everything after the skipped region.
     *
     * @returns Whether the character was found.
     */
    private fastForwardTo(c: number): boolean {
      while (++this.index < this.buffer.length) {
        const cc = this.buffer.charCodeAt(this.index)
        if (cc === CharCodes.NewLine) {
          this.recordNewline(this.index)
        }
        if (cc === c) {
          return true
        }
      }

      /*
       * We increment the index at the end of the `parse` loop,
       * so set it to `buffer.length - 1` here.
       *
       * TODO: Refactor `parse` to increment index before calling states.
       */
      this.index = this.buffer.length - 1

      return false
    }

    /**
     * Comments and CDATA end with `-->` and `]]>`.
     *
     * Their common qualities are:
     * - Their end sequences have a distinct character they start with.
     * - That character is then repeated, so we have to check multiple repeats.
     * - All characters but the start character of the sequence can be skipped.
     */
    private stateInCommentLike(c: number): void {
      if (c === this.currentSequence[this.sequenceIndex]) {
        if (++this.sequenceIndex === this.currentSequence.length) {
          if (this.currentSequence === Sequences.CdataEnd) {
            this.cbs.oncdata(this.sectionStart, this.index - 2)
          } else {
            this.cbs.oncomment(this.sectionStart, this.index - 2)
          }

          this.sequenceIndex = 0
          this.sectionStart = this.index + 1
          this.state = State.Text
        }
      } else if (this.sequenceIndex === 0) {
        // Fast-forward to the first character of the sequence
        if (this.fastForwardTo(this.currentSequence[0])) {
          this.sequenceIndex = 1
        }
      } else if (c !== this.currentSequence[this.sequenceIndex - 1]) {
        // Allow long sequences, eg. --->, ]]]>
        this.sequenceIndex = 0
      }
    }

    /**
     * Begin matching a special tag name.
     *
     * @param sequence - The matching `*End` sequence from `Sequences`.
     * @param offset - Index in `sequence` to continue matching from.
     */
    private startSpecial(sequence: Uint8Array, offset: number) {
      this.isSpecial = true
      this.currentSequence = sequence
      this.sequenceIndex = offset
      this.state = State.SpecialStartSequence
    }

    /** Directly after `<`: decide what kind of construct follows. */
    private stateBeforeTagName(c: number): void {
      if (c === CharCodes.ExclamationMark) {
        this.state = State.BeforeDeclaration
        this.sectionStart = this.index + 1
      } else if (c === CharCodes.Questionmark) {
        this.state = State.InProcessingInstruction
        this.sectionStart = this.index + 1
      } else if (isTagStartChar(c)) {
        const lower = c | 0x20
        this.sectionStart = this.index
        if (lower === Sequences.TitleEnd[2]) {
          this.startSpecial(Sequences.TitleEnd, 3)
        } else {
          this.state =
            lower === Sequences.ScriptEnd[2]
              ? State.BeforeSpecialS
              : State.InTagName
        }
      } else if (c === CharCodes.Slash) {
        this.state = State.BeforeClosingTagName
      } else {
        this.state = State.Text
        this.stateText(c)
      }
    }

    private stateInTagName(c: number): void {
      if (isEndOfTagSection(c)) {
        this.cbs.onopentagname(this.sectionStart, this.index)
        this.sectionStart = -1
        this.state = State.BeforeAttributeName
        this.stateBeforeAttributeName(c)
      }
    }

    private stateBeforeClosingTagName(c: number): void {
      if (isWhitespace(c)) {
        // Ignore
      } else if (c === CharCodes.Gt) {
        this.state = State.Text
      } else {
        this.state = isTagStartChar(c)
          ? State.InClosingTagName
          : State.InSpecialComment
        this.sectionStart = this.index
      }
    }

    private stateInClosingTagName(c: number): void {
      if (c === CharCodes.Gt || isWhitespace(c)) {
        this.cbs.onclosetag(this.sectionStart, this.index)
        this.sectionStart = -1
        this.state = State.AfterClosingTagName
        this.stateAfterClosingTagName(c)
      }
    }

    private stateAfterClosingTagName(c: number): void {
      // Skip everything until ">"
      if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
        this.state = State.Text
        this.sectionStart = this.index + 1
      }
    }

    private stateBeforeAttributeName(c: number): void {
      if (c === CharCodes.Gt) {
        this.cbs.onopentagend(this.index)
        if (this.isSpecial) {
          this.state = State.InSpecialTag
          this.sequenceIndex = 0
        } else {
          this.state = State.Text
        }
        this.sectionStart = this.index + 1
      } else if (c === CharCodes.Slash) {
        this.state = State.InSelfClosingTag
      } else if (!isWhitespace(c)) {
        this.handleAttributeStart(c)
      }
    }

    /**
     * First character of an attribute: pick between a `v-` directive, a
     * shorthand directive (`.`, `:`, `@`, `#`) and a plain attribute.
     */
    private handleAttributeStart(c: number) {
      if (
        c === CharCodes.LowerV &&
        this.buffer.charCodeAt(this.index + 1) === CharCodes.Dash
      ) {
        this.state = State.InDirectiveName
        this.sectionStart = this.index
      } else if (
        c === CharCodes.Dot ||
        c === CharCodes.Colon ||
        c === CharCodes.At ||
        c === CharCodes.Number
      ) {
        // The shorthand character itself is reported as the directive name.
        this.cbs.ondirname(this.index, this.index + 1)
        this.state = State.InDirectiveArg
        this.sectionStart = this.index + 1
      } else {
        this.state = State.InAttributeName
        this.sectionStart = this.index
      }
    }

    private stateInSelfClosingTag(c: number): void {
      if (c === CharCodes.Gt) {
        this.cbs.onselfclosingtag(this.index)
        this.state = State.Text
        this.sectionStart = this.index + 1
        this.isSpecial = false // Reset special state, in case of self-closing special tags
      } else if (!isWhitespace(c)) {
        this.state = State.BeforeAttributeName
        this.stateBeforeAttributeName(c)
      }
    }

    private stateInAttributeName(c: number): void {
      if (c === CharCodes.Eq || isEndOfTagSection(c)) {
        this.cbs.onattribname(this.sectionStart, this.index)
        this.handleAttributeNameEnd(c)
      }
    }

    private stateInDirectiveName(c: number): void {
      if (c === CharCodes.Eq || isEndOfTagSection(c)) {
        this.cbs.ondirname(this.sectionStart, this.index)
        this.handleAttributeNameEnd(c)
      } else if (c === CharCodes.Colon) {
        this.cbs.ondirname(this.sectionStart, this.index)
        this.state = State.InDirectiveArg
        this.sectionStart = this.index + 1
      } else if (c === CharCodes.Dot) {
        this.cbs.ondirname(this.sectionStart, this.index)
        this.state = State.InDirectiveModifier
        this.sectionStart = this.index + 1
      }
    }

    private stateInDirectiveArg(c: number): void {
      if (c === CharCodes.Eq || isEndOfTagSection(c)) {
        this.cbs.ondirarg(this.sectionStart, this.index)
        this.handleAttributeNameEnd(c)
      } else if (c === CharCodes.LeftSqaure) {
        this.state = State.InDirectiveDynamicArg
      } else if (c === CharCodes.Dot) {
        this.cbs.ondirarg(this.sectionStart, this.index)
        this.state = State.InDirectiveModifier
        this.sectionStart = this.index + 1
      }
    }

    /** Inside `[...]` of a dynamic directive argument: wait for `]`. */
    private stateInDynamicDirectiveArg(c: number): void {
      if (c === CharCodes.RightSquare) {
        this.state = State.InDirectiveArg
      } else if (c === CharCodes.Eq || isEndOfTagSection(c)) {
        // TODO emit error
      }
    }

    private stateInDirectiveModifier(c: number): void {
      if (c === CharCodes.Eq || isEndOfTagSection(c)) {
        this.cbs.ondirmodifier(this.sectionStart, this.index)
        this.handleAttributeNameEnd(c)
      } else if (c === CharCodes.Dot) {
        this.cbs.ondirmodifier(this.sectionStart, this.index)
        this.sectionStart = this.index + 1
      }
    }

    /** Shared tail for every "name ended" transition; reconsumes `c`. */
    private handleAttributeNameEnd(c: number): void {
      this.sectionStart = this.index
      this.state = State.AfterAttributeName
      this.cbs.onattribnameend(this.index)
      this.stateAfterAttributeName(c)
    }

    private stateAfterAttributeName(c: number): void {
      if (c === CharCodes.Eq) {
        this.state = State.BeforeAttributeValue
      } else if (c === CharCodes.Slash || c === CharCodes.Gt) {
        this.cbs.onattribend(QuoteType.NoValue, this.sectionStart)
        this.sectionStart = -1
        this.state = State.BeforeAttributeName
        this.stateBeforeAttributeName(c)
      } else if (!isWhitespace(c)) {
        this.cbs.onattribend(QuoteType.NoValue, this.sectionStart)
        this.handleAttributeStart(c)
      }
    }

    private stateBeforeAttributeValue(c: number): void {
      if (c === CharCodes.DoubleQuote) {
        this.state = State.InAttributeValueDq
        this.sectionStart = this.index + 1
      } else if (c === CharCodes.SingleQuote) {
        this.state = State.InAttributeValueSq
        this.sectionStart = this.index + 1
      } else if (!isWhitespace(c)) {
        this.sectionStart = this.index
        this.state = State.InAttributeValueNq
        this.stateInAttributeValueNoQuotes(c) // Reconsume token
      }
    }

    /**
     * Shared handler for quoted attribute values.
     *
     * @param c - Current character code.
     * @param quote - Character code of the quote that opened the value.
     */
    private handleInAttributeValue(c: number, quote: number) {
      if (c === quote) {
        this.cbs.onattribdata(this.sectionStart, this.index)
        this.sectionStart = -1
        this.cbs.onattribend(
          quote === CharCodes.DoubleQuote ? QuoteType.Double : QuoteType.Single,
          this.index + 1
        )
        this.state = State.BeforeAttributeName
      } else if (c === CharCodes.Amp) {
        this.startEntity()
      }
    }

    private stateInAttributeValueDoubleQuotes(c: number): void {
      this.handleInAttributeValue(c, CharCodes.DoubleQuote)
    }

    private stateInAttributeValueSingleQuotes(c: number): void {
      this.handleInAttributeValue(c, CharCodes.SingleQuote)
    }

    private stateInAttributeValueNoQuotes(c: number): void {
      if (isWhitespace(c) || c === CharCodes.Gt) {
        this.cbs.onattribdata(this.sectionStart, this.index)
        this.sectionStart = -1
        this.cbs.onattribend(QuoteType.Unquoted, this.index)
        this.state = State.BeforeAttributeName
        this.stateBeforeAttributeName(c)
      } else if (c === CharCodes.Amp) {
        this.startEntity()
      }
    }

    private stateBeforeDeclaration(c: number): void {
      if (c === CharCodes.OpeningSquareBracket) {
        this.state = State.CDATASequence
        this.sequenceIndex = 0
      } else {
        this.state =
          c === CharCodes.Dash ? State.BeforeComment : State.InDeclaration
      }
    }

    private stateInDeclaration(c: number): void {
      if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
        // this.cbs.ondeclaration(this.sectionStart, this.index)
        this.state = State.Text
        this.sectionStart = this.index + 1
      }
    }

    private stateInProcessingInstruction(c: number): void {
      if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
        // this.cbs.onprocessinginstruction(this.sectionStart, this.index)
        this.state = State.Text
        this.sectionStart = this.index + 1
      }
    }

    private stateBeforeComment(c: number): void {
      if (c === CharCodes.Dash) {
        this.state = State.InCommentLike
        this.currentSequence = Sequences.CommentEnd
        // Allow short comments (eg. <!-->)
        this.sequenceIndex = 2
        this.sectionStart = this.index + 1
      } else {
        this.state = State.InDeclaration
      }
    }

    private stateInSpecialComment(c: number): void {
      if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
        this.cbs.oncomment(this.sectionStart, this.index)
        this.state = State.Text
        this.sectionStart = this.index + 1
      }
    }

    /** After `<s`: decide between `<script`, `<style` and an ordinary tag. */
    private stateBeforeSpecialS(c: number): void {
      const lower = c | 0x20
      if (lower === Sequences.ScriptEnd[3]) {
        this.startSpecial(Sequences.ScriptEnd, 4)
      } else if (lower === Sequences.StyleEnd[3]) {
        this.startSpecial(Sequences.StyleEnd, 4)
      } else {
        this.state = State.InTagName
        this.stateInTagName(c) // Consume the token again
      }
    }

    /**
     * Called on `&`: remember the state to return to and start the entity
     * decoder in legacy mode for text, attribute mode otherwise.
     */
    private startEntity() {
      this.baseState = this.state
      this.state = State.InEntity
      this.entityStart = this.index
      this.entityDecoder.startEntity(
        this.baseState === State.Text || this.baseState === State.InSpecialTag
          ? DecodingMode.Legacy
          : DecodingMode.Attribute
      )
    }

    private stateInEntity(): void {
      const length = this.entityDecoder.write(this.buffer, this.index)

      // If `length` is non-negative, we are done with the entity.
      if (length >= 0) {
        this.state = this.baseState

        if (length === 0) {
          // Nothing was decoded: rewind so the characters after `&` are
          // processed again in the base state.
          this.index = this.entityStart
        }
      } else {
        // Mark buffer as consumed.
        this.index = this.buffer.length - 1
      }
    }

    /**
     * Iterates through the buffer, calling the function corresponding to the current state.
     *
     * States that are more likely to be hit are higher up, as a performance improvement.
     *
     * @param input - The complete source string to tokenize.
     */
    public parse(input: string) {
      this.buffer = input
      while (this.index < this.buffer.length) {
        const c = this.buffer.charCodeAt(this.index)
        // Record before dispatching: state handlers may move `index`.
        if (c === CharCodes.NewLine) {
          this.recordNewline(this.index)
        }
        switch (this.state) {
          case State.Text: {
            this.stateText(c)
            break
          }
          case State.Interpolation: {
            this.stateInterpolation(c)
            break
          }
          case State.SpecialStartSequence: {
            this.stateSpecialStartSequence(c)
            break
          }
          case State.InSpecialTag: {
            this.stateInSpecialTag(c)
            break
          }
          case State.CDATASequence: {
            this.stateCDATASequence(c)
            break
          }
          case State.InAttributeValueDq: {
            this.stateInAttributeValueDoubleQuotes(c)
            break
          }
          case State.InAttributeName: {
            this.stateInAttributeName(c)
            break
          }
          case State.InDirectiveName: {
            this.stateInDirectiveName(c)
            break
          }
          case State.InDirectiveArg: {
            this.stateInDirectiveArg(c)
            break
          }
          case State.InDirectiveDynamicArg: {
            this.stateInDynamicDirectiveArg(c)
            break
          }
          case State.InDirectiveModifier: {
            this.stateInDirectiveModifier(c)
            break
          }
          case State.InCommentLike: {
            this.stateInCommentLike(c)
            break
          }
          case State.InSpecialComment: {
            this.stateInSpecialComment(c)
            break
          }
          case State.BeforeAttributeName: {
            this.stateBeforeAttributeName(c)
            break
          }
          case State.InTagName: {
            this.stateInTagName(c)
            break
          }
          case State.InClosingTagName: {
            this.stateInClosingTagName(c)
            break
          }
          case State.BeforeTagName: {
            this.stateBeforeTagName(c)
            break
          }
          case State.AfterAttributeName: {
            this.stateAfterAttributeName(c)
            break
          }
          case State.InAttributeValueSq: {
            this.stateInAttributeValueSingleQuotes(c)
            break
          }
          case State.BeforeAttributeValue: {
            this.stateBeforeAttributeValue(c)
            break
          }
          case State.BeforeClosingTagName: {
            this.stateBeforeClosingTagName(c)
            break
          }
          case State.AfterClosingTagName: {
            this.stateAfterClosingTagName(c)
            break
          }
          case State.BeforeSpecialS: {
            this.stateBeforeSpecialS(c)
            break
          }
          case State.InAttributeValueNq: {
            this.stateInAttributeValueNoQuotes(c)
            break
          }
          case State.InSelfClosingTag: {
            this.stateInSelfClosingTag(c)
            break
          }
          case State.InDeclaration: {
            this.stateInDeclaration(c)
            break
          }
          case State.BeforeDeclaration: {
            this.stateBeforeDeclaration(c)
            break
          }
          case State.BeforeComment: {
            this.stateBeforeComment(c)
            break
          }
          case State.InProcessingInstruction: {
            this.stateInProcessingInstruction(c)
            break
          }
          case State.InEntity: {
            this.stateInEntity()
            break
          }
        }
        this.index++
      }
      this.cleanup()
      this.finish()
    }

    /**
     * Emit the pending text / attribute-value section once the buffer has
     * been walked.
     */
    private cleanup() {
      // If we are inside of text or attributes, emit what we already have.
      if (this.sectionStart !== this.index) {
        if (
          this.state === State.Text ||
          (this.state === State.InSpecialTag && this.sequenceIndex === 0)
        ) {
          this.cbs.ontext(this.sectionStart, this.index)
          this.sectionStart = this.index
        } else if (
          this.state === State.InAttributeValueDq ||
          this.state === State.InAttributeValueSq ||
          this.state === State.InAttributeValueNq
        ) {
          this.cbs.onattribdata(this.sectionStart, this.index)
          this.sectionStart = this.index
        }
      }
    }

    /** Flush a pending entity, handle trailing data and signal the end. */
    private finish() {
      if (this.state === State.InEntity) {
        this.entityDecoder.end()
        this.state = this.baseState
      }

      this.handleTrailingData()

      this.cbs.onend()
    }

    /** Handle any trailing data. */
    private handleTrailingData() {
      const endIndex = this.buffer.length

      // If there is no remaining data, we are done.
      if (this.sectionStart >= endIndex) {
        return
      }

      if (this.state === State.InCommentLike) {
        if (this.currentSequence === Sequences.CdataEnd) {
          this.cbs.oncdata(this.sectionStart, endIndex)
        } else {
          this.cbs.oncomment(this.sectionStart, endIndex)
        }
      } else if (
        this.state === State.InTagName ||
        this.state === State.BeforeAttributeName ||
        this.state === State.BeforeAttributeValue ||
        this.state === State.AfterAttributeName ||
        this.state === State.InAttributeName ||
        this.state === State.InDirectiveName ||
        this.state === State.InDirectiveArg ||
        this.state === State.InDirectiveDynamicArg ||
        this.state === State.InDirectiveModifier ||
        this.state === State.InAttributeValueSq ||
        this.state === State.InAttributeValueDq ||
        this.state === State.InAttributeValueNq ||
        this.state === State.InClosingTagName
      ) {
        /*
         * If we are currently in an opening or closing tag, us not calling the
         * respective callback signals that the tag should be ignored.
         */
      } else {
        this.cbs.ontext(this.sectionStart, endIndex)
      }
    }

    /**
     * Entity decoder callback: flush the section preceding the entity, then
     * report the decoded code point.
     *
     * @param cp - The decoded code point.
     * @param consumed - Number of input characters the entity occupied.
     */
    private emitCodePoint(cp: number, consumed: number): void {
      if (
        this.baseState !== State.Text &&
        this.baseState !== State.InSpecialTag
      ) {
        if (this.sectionStart < this.entityStart) {
          this.cbs.onattribdata(this.sectionStart, this.entityStart)
        }
        this.sectionStart = this.entityStart + consumed
        // The `parse` loop increments `index`, so stop one short of the
        // first character after the entity.
        this.index = this.sectionStart - 1

        this.cbs.onattribentity(cp)
      } else {
        if (this.sectionStart < this.entityStart) {
          this.cbs.ontext(this.sectionStart, this.entityStart)
        }
        this.sectionStart = this.entityStart + consumed
        this.index = this.sectionStart - 1

        this.cbs.ontextentity(cp, this.sectionStart)
      }
    }
  }