lexer.go 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929
  1. // Copyright 2016 PingCAP, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // See the License for the specific language governing permissions and
  12. // limitations under the License.
  13. package parser
  14. import (
  15. "bytes"
  16. "fmt"
  17. "strconv"
  18. "strings"
  19. "unicode"
  20. "unicode/utf8"
  21. "github.com/pingcap/parser/mysql"
  22. )
  23. var _ = yyLexer(&Scanner{})
  24. // Pos represents the position of a token.
  25. type Pos struct {
  26. Line int
  27. Col int
  28. Offset int
  29. }
  30. // Scanner implements the yyLexer interface.
  31. type Scanner struct {
  32. r reader
  33. buf bytes.Buffer
  34. errs []error
  35. warns []error
  36. stmtStartPos int
  37. // inBangComment is true if we are inside a `/*! ... */` block.
  38. // It is used to ignore a stray `*/` when scanning.
  39. inBangComment bool
  40. sqlMode mysql.SQLMode
  41. // If the lexer should recognize keywords for window function.
  42. // It may break the compatibility when support those keywords,
  43. // because some application may already use them as identifiers.
  44. supportWindowFunc bool
  45. // lastScanOffset indicates last offset returned by scan().
  46. // It's used to substring sql in syntax error message.
  47. lastScanOffset int
  48. // lastKeyword records the previous keyword returned by scan().
  49. // determine whether an optimizer hint should be parsed or ignored.
  50. lastKeyword int
  51. // lastKeyword2 records the keyword before lastKeyword, it is used
  52. // to disambiguate hint after for update, which should be ignored.
  53. lastKeyword2 int
  54. // lastKeyword3 records the keyword before lastKeyword2, it is used
  55. // to disambiguate hint after create binding for update, which should
  56. // be pertained.
  57. lastKeyword3 int
  58. // hintPos records the start position of the previous optimizer hint.
  59. lastHintPos Pos
  60. // true if a dot follows an identifier
  61. identifierDot bool
  62. }
  63. // Errors returns the errors and warns during a scan.
  64. func (s *Scanner) Errors() (warns []error, errs []error) {
  65. return s.warns, s.errs
  66. }
  67. // reset resets the sql string to be scanned.
  68. func (s *Scanner) reset(sql string) {
  69. s.r = reader{s: sql, p: Pos{Line: 1}}
  70. s.buf.Reset()
  71. s.errs = s.errs[:0]
  72. s.warns = s.warns[:0]
  73. s.stmtStartPos = 0
  74. s.inBangComment = false
  75. s.lastKeyword = 0
  76. }
  77. func (s *Scanner) stmtText() string {
  78. endPos := s.r.pos().Offset
  79. if s.r.s[endPos-1] == '\n' {
  80. endPos = endPos - 1 // trim new line
  81. }
  82. if s.r.s[s.stmtStartPos] == '\n' {
  83. s.stmtStartPos++
  84. }
  85. text := s.r.s[s.stmtStartPos:endPos]
  86. s.stmtStartPos = endPos
  87. return text
  88. }
  89. // Errorf tells scanner something is wrong.
  90. // Scanner satisfies yyLexer interface which need this function.
  91. func (s *Scanner) Errorf(format string, a ...interface{}) (err error) {
  92. str := fmt.Sprintf(format, a...)
  93. val := s.r.s[s.lastScanOffset:]
  94. var lenStr = ""
  95. if len(val) > 2048 {
  96. lenStr = "(total length " + strconv.Itoa(len(val)) + ")"
  97. val = val[:2048]
  98. }
  99. err = fmt.Errorf("line %d column %d near \"%s\"%s %s",
  100. s.r.p.Line, s.r.p.Col, val, str, lenStr)
  101. return
  102. }
  103. // AppendError sets error into scanner.
  104. // Scanner satisfies yyLexer interface which need this function.
  105. func (s *Scanner) AppendError(err error) {
  106. if err == nil {
  107. return
  108. }
  109. s.errs = append(s.errs, err)
  110. }
  111. // Lex returns a token and store the token value in v.
  112. // Scanner satisfies yyLexer interface.
  113. // 0 and invalid are special token id this function would return:
  114. // return 0 tells parser that scanner meets EOF,
  115. // return invalid tells parser that scanner meets illegal character.
  116. func (s *Scanner) Lex(v *yySymType) int {
  117. tok, pos, lit := s.scan()
  118. s.lastScanOffset = pos.Offset
  119. s.lastKeyword3 = s.lastKeyword2
  120. s.lastKeyword2 = s.lastKeyword
  121. s.lastKeyword = 0
  122. v.offset = int(pos.Offset)
  123. v.ident = lit
  124. if tok == identifier {
  125. tok = handleIdent(v)
  126. }
  127. if tok == identifier {
  128. if tok1 := s.isTokenIdentifier(lit, pos.Offset); tok1 != 0 {
  129. tok = tok1
  130. s.lastKeyword = tok1
  131. }
  132. }
  133. if s.sqlMode.HasANSIQuotesMode() &&
  134. tok == stringLit &&
  135. s.r.s[v.offset] == '"' {
  136. tok = identifier
  137. }
  138. if tok == pipes && !(s.sqlMode.HasPipesAsConcatMode()) {
  139. return pipesAsOr
  140. }
  141. if tok == not && s.sqlMode.HasHighNotPrecedenceMode() {
  142. return not2
  143. }
  144. switch tok {
  145. case intLit:
  146. return toInt(s, v, lit)
  147. case floatLit:
  148. return toFloat(s, v, lit)
  149. case decLit:
  150. return toDecimal(s, v, lit)
  151. case hexLit:
  152. return toHex(s, v, lit)
  153. case bitLit:
  154. return toBit(s, v, lit)
  155. case singleAtIdentifier, doubleAtIdentifier, cast, extract:
  156. v.item = lit
  157. return tok
  158. case null:
  159. v.item = nil
  160. case quotedIdentifier:
  161. tok = identifier
  162. }
  163. if tok == unicode.ReplacementChar {
  164. return invalid
  165. }
  166. return tok
  167. }
  168. // SetSQLMode sets the SQL mode for scanner.
  169. func (s *Scanner) SetSQLMode(mode mysql.SQLMode) {
  170. s.sqlMode = mode
  171. }
  172. // GetSQLMode return the SQL mode of scanner.
  173. func (s *Scanner) GetSQLMode() mysql.SQLMode {
  174. return s.sqlMode
  175. }
  176. // EnableWindowFunc controls whether the scanner recognize the keywords of window function.
  177. func (s *Scanner) EnableWindowFunc(val bool) {
  178. s.supportWindowFunc = val
  179. }
  180. // InheritScanner returns a new scanner object which inherits configurations from the parent scanner.
  181. func (s *Scanner) InheritScanner(sql string) *Scanner {
  182. return &Scanner{
  183. r: reader{s: sql},
  184. sqlMode: s.sqlMode,
  185. supportWindowFunc: s.supportWindowFunc,
  186. }
  187. }
  188. // NewScanner returns a new scanner object.
  189. func NewScanner(s string) *Scanner {
  190. return &Scanner{r: reader{s: s}}
  191. }
  192. func (s *Scanner) skipWhitespace() rune {
  193. return s.r.incAsLongAs(unicode.IsSpace)
  194. }
  195. func (s *Scanner) scan() (tok int, pos Pos, lit string) {
  196. ch0 := s.r.peek()
  197. if unicode.IsSpace(ch0) {
  198. ch0 = s.skipWhitespace()
  199. }
  200. pos = s.r.pos()
  201. if s.r.eof() {
  202. // when scanner meets EOF, the returned token should be 0,
  203. // because 0 is a special token id to remind the parser that stream is end.
  204. return 0, pos, ""
  205. }
  206. if !s.r.eof() && isIdentExtend(ch0) {
  207. return scanIdentifier(s)
  208. }
  209. // search a trie to get a token.
  210. node := &ruleTable
  211. for ch0 >= 0 && ch0 <= 255 {
  212. if node.childs[ch0] == nil || s.r.eof() {
  213. break
  214. }
  215. node = node.childs[ch0]
  216. if node.fn != nil {
  217. return node.fn(s)
  218. }
  219. s.r.inc()
  220. ch0 = s.r.peek()
  221. }
  222. tok, lit = node.token, s.r.data(&pos)
  223. return
  224. }
  225. func startWithXx(s *Scanner) (tok int, pos Pos, lit string) {
  226. pos = s.r.pos()
  227. s.r.inc()
  228. if s.r.peek() == '\'' {
  229. s.r.inc()
  230. s.scanHex()
  231. if s.r.peek() == '\'' {
  232. s.r.inc()
  233. tok, lit = hexLit, s.r.data(&pos)
  234. } else {
  235. tok = unicode.ReplacementChar
  236. }
  237. return
  238. }
  239. s.r.p = pos
  240. return scanIdentifier(s)
  241. }
  242. func startWithNn(s *Scanner) (tok int, pos Pos, lit string) {
  243. tok, pos, lit = scanIdentifier(s)
  244. // The National Character Set, N'some text' or n'some test'.
  245. // See https://dev.mysql.com/doc/refman/5.7/en/string-literals.html
  246. // and https://dev.mysql.com/doc/refman/5.7/en/charset-national.html
  247. if lit == "N" || lit == "n" {
  248. if s.r.peek() == '\'' {
  249. tok = underscoreCS
  250. lit = "utf8"
  251. }
  252. }
  253. return
  254. }
  255. func startWithBb(s *Scanner) (tok int, pos Pos, lit string) {
  256. pos = s.r.pos()
  257. s.r.inc()
  258. if s.r.peek() == '\'' {
  259. s.r.inc()
  260. s.scanBit()
  261. if s.r.peek() == '\'' {
  262. s.r.inc()
  263. tok, lit = bitLit, s.r.data(&pos)
  264. } else {
  265. tok = unicode.ReplacementChar
  266. }
  267. return
  268. }
  269. s.r.p = pos
  270. return scanIdentifier(s)
  271. }
  272. func startWithSharp(s *Scanner) (tok int, pos Pos, lit string) {
  273. s.r.incAsLongAs(func(ch rune) bool {
  274. return ch != '\n'
  275. })
  276. return s.scan()
  277. }
  278. func startWithDash(s *Scanner) (tok int, pos Pos, lit string) {
  279. pos = s.r.pos()
  280. if strings.HasPrefix(s.r.s[pos.Offset:], "--") {
  281. remainLen := len(s.r.s[pos.Offset:])
  282. if remainLen == 2 || (remainLen > 2 && unicode.IsSpace(rune(s.r.s[pos.Offset+2]))) {
  283. s.r.incAsLongAs(func(ch rune) bool {
  284. return ch != '\n'
  285. })
  286. return s.scan()
  287. }
  288. }
  289. if strings.HasPrefix(s.r.s[pos.Offset:], "->>") {
  290. tok = juss
  291. s.r.incN(3)
  292. return
  293. }
  294. if strings.HasPrefix(s.r.s[pos.Offset:], "->") {
  295. tok = jss
  296. s.r.incN(2)
  297. return
  298. }
  299. tok = int('-')
  300. lit = "-"
  301. s.r.inc()
  302. return
  303. }
  304. func startWithSlash(s *Scanner) (tok int, pos Pos, lit string) {
  305. pos = s.r.pos()
  306. s.r.inc()
  307. if s.r.peek() != '*' {
  308. tok = int('/')
  309. lit = "/"
  310. return
  311. }
  312. isOptimizerHint := false
  313. currentCharIsStar := false
  314. s.r.inc() // we see '/*' so far.
  315. switch s.r.readByte() {
  316. case '!': // '/*!' MySQL-specific comments
  317. // See http://dev.mysql.com/doc/refman/5.7/en/comments.html
  318. // in '/*!', which we always recognize regardless of version.
  319. s.scanVersionDigits(5, 5)
  320. s.inBangComment = true
  321. return s.scan()
  322. case 'T': // '/*T' maybe TiDB-specific comments
  323. if s.r.peek() != '!' {
  324. // '/*TX' is just normal comment.
  325. break
  326. }
  327. s.r.inc()
  328. // in '/*T!', try to match the pattern '/*T![feature1,feature2,...]'.
  329. features := s.scanFeatureIDs()
  330. if SpecialCommentsController.ContainsAll(features) {
  331. s.inBangComment = true
  332. return s.scan()
  333. }
  334. case 'M': // '/*M' maybe MariaDB-specific comments
  335. // no special treatment for now.
  336. break
  337. case '+': // '/*+' optimizer hints
  338. // See https://dev.mysql.com/doc/refman/5.7/en/optimizer-hints.html
  339. if _, ok := hintedTokens[s.lastKeyword]; ok {
  340. // only recognize optimizers hints directly followed by certain
  341. // keywords like SELECT, INSERT, etc., only a special case "FOR UPDATE" needs to be handled
  342. // we will report a warning in order to match MySQL's behavior, but the hint content will be ignored
  343. if s.lastKeyword2 == forKwd {
  344. if s.lastKeyword3 == binding {
  345. // special case of `create binding for update`
  346. isOptimizerHint = true
  347. } else {
  348. s.warns = append(s.warns, ParseErrorWith(s.r.data(&pos), s.r.p.Line))
  349. }
  350. } else {
  351. isOptimizerHint = true
  352. }
  353. }
  354. case '*': // '/**' if the next char is '/' it would close the comment.
  355. currentCharIsStar = true
  356. default:
  357. break
  358. }
  359. // standard C-like comment. read until we see '*/' then drop it.
  360. for {
  361. if currentCharIsStar || s.r.incAsLongAs(func(ch rune) bool { return ch != '*' }) == '*' {
  362. switch s.r.readByte() {
  363. case '/':
  364. // Meets */, means comment end.
  365. if isOptimizerHint {
  366. s.lastHintPos = pos
  367. return hintComment, pos, s.r.data(&pos)
  368. } else {
  369. return s.scan()
  370. }
  371. case 0:
  372. break
  373. case '*':
  374. currentCharIsStar = true
  375. continue
  376. default:
  377. currentCharIsStar = false
  378. continue
  379. }
  380. }
  381. // unclosed comment or other errors.
  382. s.errs = append(s.errs, ParseErrorWith(s.r.data(&pos), s.r.p.Line))
  383. return
  384. }
  385. }
  386. func startWithStar(s *Scanner) (tok int, pos Pos, lit string) {
  387. pos = s.r.pos()
  388. s.r.inc()
  389. // skip and exit '/*!' if we see '*/'
  390. if s.inBangComment && s.r.peek() == '/' {
  391. s.inBangComment = false
  392. s.r.inc()
  393. return s.scan()
  394. }
  395. // otherwise it is just a normal star.
  396. s.identifierDot = false
  397. return '*', pos, "*"
  398. }
  399. func startWithAt(s *Scanner) (tok int, pos Pos, lit string) {
  400. pos = s.r.pos()
  401. s.r.inc()
  402. tok, lit = scanIdentifierOrString(s)
  403. switch tok {
  404. case '@':
  405. s.r.inc()
  406. stream := s.r.s[pos.Offset+2:]
  407. var prefix string
  408. for _, v := range []string{"global.", "session.", "local."} {
  409. if len(v) > len(stream) {
  410. continue
  411. }
  412. if strings.EqualFold(stream[:len(v)], v) {
  413. prefix = v
  414. s.r.incN(len(v))
  415. break
  416. }
  417. }
  418. tok, lit = scanIdentifierOrString(s)
  419. switch tok {
  420. case stringLit, quotedIdentifier:
  421. tok, lit = doubleAtIdentifier, "@@"+prefix+lit
  422. case identifier:
  423. tok, lit = doubleAtIdentifier, s.r.data(&pos)
  424. }
  425. case unicode.ReplacementChar:
  426. break
  427. default:
  428. tok = singleAtIdentifier
  429. }
  430. return
  431. }
  432. func scanIdentifier(s *Scanner) (int, Pos, string) {
  433. pos := s.r.pos()
  434. s.r.incAsLongAs(isIdentChar)
  435. s.identifierDot = s.r.peek() == '.'
  436. return identifier, pos, s.r.data(&pos)
  437. }
  438. func scanIdentifierOrString(s *Scanner) (tok int, lit string) {
  439. ch1 := s.r.peek()
  440. switch ch1 {
  441. case '\'', '"':
  442. tok, _, lit = startString(s)
  443. case '`':
  444. tok, _, lit = scanQuotedIdent(s)
  445. default:
  446. if isUserVarChar(ch1) {
  447. pos := s.r.pos()
  448. s.r.incAsLongAs(isUserVarChar)
  449. tok, lit = identifier, s.r.data(&pos)
  450. } else {
  451. tok = int(ch1)
  452. }
  453. }
  454. return
  455. }
  456. var (
  457. quotedIdentifier = -identifier
  458. )
  459. func scanQuotedIdent(s *Scanner) (tok int, pos Pos, lit string) {
  460. pos = s.r.pos()
  461. s.r.inc()
  462. s.buf.Reset()
  463. for {
  464. ch := s.r.readByte()
  465. if ch == unicode.ReplacementChar && s.r.eof() {
  466. tok = unicode.ReplacementChar
  467. return
  468. }
  469. if ch == '`' {
  470. if s.r.peek() != '`' {
  471. // don't return identifier in case that it's interpreted as keyword token later.
  472. tok, lit = quotedIdentifier, s.buf.String()
  473. s.identifierDot = false
  474. return
  475. }
  476. s.r.inc()
  477. }
  478. s.buf.WriteRune(ch)
  479. }
  480. }
  481. func startString(s *Scanner) (tok int, pos Pos, lit string) {
  482. return s.scanString()
  483. }
  484. // lazyBuf is used to avoid allocation if possible.
  485. // it has a useBuf field indicates whether bytes.Buffer is necessary. if
  486. // useBuf is false, we can avoid calling bytes.Buffer.String(), which
  487. // make a copy of data and cause allocation.
  488. type lazyBuf struct {
  489. useBuf bool
  490. r *reader
  491. b *bytes.Buffer
  492. p *Pos
  493. }
  494. func (mb *lazyBuf) setUseBuf(str string) {
  495. if !mb.useBuf {
  496. mb.useBuf = true
  497. mb.b.Reset()
  498. mb.b.WriteString(str)
  499. }
  500. }
  501. func (mb *lazyBuf) writeRune(r rune, w int) {
  502. if mb.useBuf {
  503. if w > 1 {
  504. mb.b.WriteRune(r)
  505. } else {
  506. mb.b.WriteByte(byte(r))
  507. }
  508. }
  509. }
  510. func (mb *lazyBuf) data() string {
  511. var lit string
  512. if mb.useBuf {
  513. lit = mb.b.String()
  514. } else {
  515. lit = mb.r.data(mb.p)
  516. lit = lit[1 : len(lit)-1]
  517. }
  518. return lit
  519. }
  520. func (s *Scanner) scanString() (tok int, pos Pos, lit string) {
  521. tok, pos = stringLit, s.r.pos()
  522. mb := lazyBuf{false, &s.r, &s.buf, &pos}
  523. ending := s.r.readByte()
  524. ch0 := s.r.peek()
  525. for !s.r.eof() {
  526. if ch0 == ending {
  527. s.r.inc()
  528. if s.r.peek() != ending {
  529. lit = mb.data()
  530. return
  531. }
  532. str := mb.r.data(&pos)
  533. mb.setUseBuf(str[1 : len(str)-1])
  534. } else if ch0 == '\\' && !s.sqlMode.HasNoBackslashEscapesMode() {
  535. mb.setUseBuf(mb.r.data(&pos)[1:])
  536. ch0 = handleEscape(s)
  537. }
  538. mb.writeRune(ch0, s.r.w)
  539. if !s.r.eof() {
  540. s.r.inc()
  541. ch0 = s.r.peek()
  542. }
  543. }
  544. tok = unicode.ReplacementChar
  545. return
  546. }
  547. // handleEscape handles the case in scanString when previous char is '\'.
  548. func handleEscape(s *Scanner) rune {
  549. s.r.inc()
  550. ch0 := s.r.peek()
  551. /*
  552. \" \' \\ \n \0 \b \Z \r \t ==> escape to one char
  553. \% \_ ==> preserve both char
  554. other ==> remove \
  555. */
  556. switch ch0 {
  557. case 'n':
  558. ch0 = '\n'
  559. case '0':
  560. ch0 = 0
  561. case 'b':
  562. ch0 = 8
  563. case 'Z':
  564. ch0 = 26
  565. case 'r':
  566. ch0 = '\r'
  567. case 't':
  568. ch0 = '\t'
  569. case '%', '_':
  570. s.buf.WriteByte('\\')
  571. }
  572. return ch0
  573. }
  574. func startWithNumber(s *Scanner) (tok int, pos Pos, lit string) {
  575. if s.identifierDot {
  576. return scanIdentifier(s)
  577. }
  578. pos = s.r.pos()
  579. tok = intLit
  580. ch0 := s.r.readByte()
  581. if ch0 == '0' {
  582. tok = intLit
  583. ch1 := s.r.peek()
  584. switch {
  585. case ch1 >= '0' && ch1 <= '7':
  586. s.r.inc()
  587. s.scanOct()
  588. case ch1 == 'x' || ch1 == 'X':
  589. s.r.inc()
  590. p1 := s.r.pos()
  591. s.scanHex()
  592. p2 := s.r.pos()
  593. // 0x, 0x7fz3 are identifier
  594. if p1 == p2 || isDigit(s.r.peek()) {
  595. s.r.incAsLongAs(isIdentChar)
  596. return identifier, pos, s.r.data(&pos)
  597. }
  598. tok = hexLit
  599. case ch1 == 'b':
  600. s.r.inc()
  601. p1 := s.r.pos()
  602. s.scanBit()
  603. p2 := s.r.pos()
  604. // 0b, 0b123, 0b1ab are identifier
  605. if p1 == p2 || isDigit(s.r.peek()) {
  606. s.r.incAsLongAs(isIdentChar)
  607. return identifier, pos, s.r.data(&pos)
  608. }
  609. tok = bitLit
  610. case ch1 == '.':
  611. return s.scanFloat(&pos)
  612. case ch1 == 'B':
  613. s.r.incAsLongAs(isIdentChar)
  614. return identifier, pos, s.r.data(&pos)
  615. }
  616. }
  617. s.scanDigits()
  618. ch0 = s.r.peek()
  619. if ch0 == '.' || ch0 == 'e' || ch0 == 'E' {
  620. return s.scanFloat(&pos)
  621. }
  622. // Identifiers may begin with a digit but unless quoted may not consist solely of digits.
  623. if !s.r.eof() && isIdentChar(ch0) {
  624. s.r.incAsLongAs(isIdentChar)
  625. return identifier, pos, s.r.data(&pos)
  626. }
  627. lit = s.r.data(&pos)
  628. return
  629. }
  630. func startWithDot(s *Scanner) (tok int, pos Pos, lit string) {
  631. pos = s.r.pos()
  632. s.r.inc()
  633. if s.identifierDot {
  634. return int('.'), pos, "."
  635. }
  636. if isDigit(s.r.peek()) {
  637. tok, p, l := s.scanFloat(&pos)
  638. if tok == identifier {
  639. return invalid, p, l
  640. }
  641. return tok, p, l
  642. }
  643. tok, lit = int('.'), "."
  644. return
  645. }
  646. func (s *Scanner) scanOct() {
  647. s.r.incAsLongAs(func(ch rune) bool {
  648. return ch >= '0' && ch <= '7'
  649. })
  650. }
  651. func (s *Scanner) scanHex() {
  652. s.r.incAsLongAs(func(ch rune) bool {
  653. return ch >= '0' && ch <= '9' ||
  654. ch >= 'a' && ch <= 'f' ||
  655. ch >= 'A' && ch <= 'F'
  656. })
  657. }
  658. func (s *Scanner) scanBit() {
  659. s.r.incAsLongAs(func(ch rune) bool {
  660. return ch == '0' || ch == '1'
  661. })
  662. }
  663. func (s *Scanner) scanFloat(beg *Pos) (tok int, pos Pos, lit string) {
  664. s.r.p = *beg
  665. // float = D1 . D2 e D3
  666. s.scanDigits()
  667. ch0 := s.r.peek()
  668. if ch0 == '.' {
  669. s.r.inc()
  670. s.scanDigits()
  671. ch0 = s.r.peek()
  672. }
  673. if ch0 == 'e' || ch0 == 'E' {
  674. s.r.inc()
  675. ch0 = s.r.peek()
  676. if ch0 == '-' || ch0 == '+' {
  677. s.r.inc()
  678. }
  679. if isDigit(s.r.peek()) {
  680. s.scanDigits()
  681. tok = floatLit
  682. } else {
  683. // D1 . D2 e XX when XX is not D3, parse the result to an identifier.
  684. // 9e9e = 9e9(float) + e(identifier)
  685. // 9est = 9est(identifier)
  686. s.r.p = *beg
  687. s.r.incAsLongAs(isIdentChar)
  688. tok = identifier
  689. }
  690. } else {
  691. tok = decLit
  692. }
  693. pos, lit = *beg, s.r.data(beg)
  694. return
  695. }
  696. func (s *Scanner) scanDigits() string {
  697. pos := s.r.pos()
  698. s.r.incAsLongAs(isDigit)
  699. return s.r.data(&pos)
  700. }
  701. // scanVersionDigits scans for `min` to `max` digits (range inclusive) used in
  702. // `/*!12345 ... */` comments.
  703. func (s *Scanner) scanVersionDigits(min, max int) {
  704. pos := s.r.pos()
  705. for i := 0; i < max; i++ {
  706. ch := s.r.peek()
  707. if isDigit(ch) {
  708. s.r.inc()
  709. } else if i < min {
  710. s.r.p = pos
  711. return
  712. } else {
  713. break
  714. }
  715. }
  716. }
  717. func (s *Scanner) scanFeatureIDs() (featureIDs []string) {
  718. pos := s.r.pos()
  719. const init, expectChar, obtainChar = 0, 1, 2
  720. state := init
  721. var b strings.Builder
  722. for !s.r.eof() {
  723. ch := s.r.peek()
  724. s.r.inc()
  725. switch state {
  726. case init:
  727. if ch == '[' {
  728. state = expectChar
  729. break
  730. }
  731. s.r.p = pos
  732. return nil
  733. case expectChar:
  734. if isIdentChar(ch) {
  735. b.WriteRune(ch)
  736. state = obtainChar
  737. break
  738. }
  739. s.r.p = pos
  740. return nil
  741. case obtainChar:
  742. if isIdentChar(ch) {
  743. b.WriteRune(ch)
  744. state = obtainChar
  745. break
  746. } else if ch == ',' {
  747. featureIDs = append(featureIDs, b.String())
  748. b.Reset()
  749. state = expectChar
  750. break
  751. } else if ch == ']' {
  752. featureIDs = append(featureIDs, b.String())
  753. return featureIDs
  754. }
  755. s.r.p = pos
  756. return nil
  757. }
  758. }
  759. s.r.p = pos
  760. return nil
  761. }
  762. func (s *Scanner) lastErrorAsWarn() {
  763. if len(s.errs) == 0 {
  764. return
  765. }
  766. s.warns = append(s.warns, s.errs[len(s.errs)-1])
  767. s.errs = s.errs[:len(s.errs)-1]
  768. }
  769. type reader struct {
  770. s string
  771. p Pos
  772. w int
  773. }
  774. var eof = Pos{-1, -1, -1}
  775. func (r *reader) eof() bool {
  776. return r.p.Offset >= len(r.s)
  777. }
  778. // peek() peeks a rune from underlying reader.
  779. // if reader meets EOF, it will return unicode.ReplacementChar. to distinguish from
  780. // the real unicode.ReplacementChar, the caller should call r.eof() again to check.
  781. func (r *reader) peek() rune {
  782. if r.eof() {
  783. return unicode.ReplacementChar
  784. }
  785. v, w := rune(r.s[r.p.Offset]), 1
  786. switch {
  787. case v == 0:
  788. r.w = w
  789. return v // illegal UTF-8 encoding
  790. case v >= 0x80:
  791. v, w = utf8.DecodeRuneInString(r.s[r.p.Offset:])
  792. if v == utf8.RuneError && w == 1 {
  793. v = rune(r.s[r.p.Offset]) // illegal UTF-8 encoding
  794. }
  795. }
  796. r.w = w
  797. return v
  798. }
  799. // inc increase the position offset of the reader.
  800. // peek must be called before calling inc!
  801. func (r *reader) inc() {
  802. if r.s[r.p.Offset] == '\n' {
  803. r.p.Line++
  804. r.p.Col = 0
  805. }
  806. r.p.Offset += r.w
  807. r.p.Col++
  808. }
  809. func (r *reader) incN(n int) {
  810. for i := 0; i < n; i++ {
  811. r.inc()
  812. }
  813. }
  814. func (r *reader) readByte() (ch rune) {
  815. ch = r.peek()
  816. if ch == unicode.ReplacementChar && r.eof() {
  817. return
  818. }
  819. r.inc()
  820. return
  821. }
  822. func (r *reader) pos() Pos {
  823. return r.p
  824. }
  825. func (r *reader) data(from *Pos) string {
  826. return r.s[from.Offset:r.p.Offset]
  827. }
  828. func (r *reader) incAsLongAs(fn func(rune) bool) rune {
  829. for {
  830. ch := r.peek()
  831. if !fn(ch) {
  832. return ch
  833. }
  834. if ch == unicode.ReplacementChar && r.eof() {
  835. return 0
  836. }
  837. r.inc()
  838. }
  839. }