digester.go 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415
  1. // Copyright 2019 PingCAP, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. // limitations under the License.
  13. package parser
  14. import (
  15. "bytes"
  16. "crypto/sha256"
  17. "fmt"
  18. hash2 "hash"
  19. "reflect"
  20. "strings"
  21. "sync"
  22. "unicode"
  23. "unsafe"
  24. "github.com/pingcap/parser/charset"
  25. )
  26. // DigestHash generates the digest of statements.
  27. // it will generate a hash on normalized form of statement text
  28. // which removes general property of a statement but keeps specific property.
  29. //
  30. // for example: both DigestHash('select 1') and DigestHash('select 2') => e1c71d1661ae46e09b7aaec1c390957f0d6260410df4e4bc71b9c8d681021471
  31. //
  32. // Deprecated: It is logically consistent with NormalizeDigest.
  33. func DigestHash(sql string) (result string) {
  34. d := digesterPool.Get().(*sqlDigester)
  35. result = d.doDigest(sql)
  36. digesterPool.Put(d)
  37. return
  38. }
  39. // DigestNormalized generates the digest of a normalized sql.
  40. // it will generate a hash on a normalized sql.
  41. // Normalize + DigestNormalized equals to NormalizeDigest.
  42. //
  43. // for example: DigestNormalized('select ?')
  44. // DigestNormalized should be called with a normalized SQL string (like 'select ?') generated by function Normalize.
  45. // do not call with SQL which is not normalized, DigestNormalized('select 1') and DigestNormalized('select 2') is not the same
  46. func DigestNormalized(normalized string) (result string) {
  47. d := digesterPool.Get().(*sqlDigester)
  48. result = d.doDigestNormalized(normalized)
  49. digesterPool.Put(d)
  50. return
  51. }
  52. // Normalize generates the normalized statements.
  53. // it will get normalized form of statement text
  54. // which removes general property of a statement but keeps specific property.
  55. //
  56. // for example: Normalize('select 1 from b where a = 1') => 'select ? from b where a = ?'
  57. func Normalize(sql string) (result string) {
  58. d := digesterPool.Get().(*sqlDigester)
  59. result = d.doNormalize(sql)
  60. digesterPool.Put(d)
  61. return
  62. }
  63. // NormalizeDigest combines Normalize and DigestNormalized into one method.
  64. func NormalizeDigest(sql string) (normalized, digest string) {
  65. d := digesterPool.Get().(*sqlDigester)
  66. normalized, digest = d.doNormalizeDigest(sql)
  67. digesterPool.Put(d)
  68. return
  69. }
// digesterPool reuses sqlDigester instances (each owning a scanner and a
// SHA-256 hasher) across calls, avoiding per-statement allocations.
var digesterPool = sync.Pool{
	New: func() interface{} {
		return &sqlDigester{
			lexer:  NewScanner(""),
			hasher: sha256.New(),
		}
	},
}
// sqlDigester is used to compute DigestHash or Normalize for sql.
// Instances are pooled via digesterPool, so every doXxx entry point must
// leave buffer, hasher, and tokens reset before the digester is reused.
type sqlDigester struct {
	buffer bytes.Buffer // accumulates the normalized statement text
	lexer  *Scanner     // tokenizer, reset per statement and reused
	hasher hash2.Hash   // SHA-256 state used to produce the digest
	tokens tokenDeque   // look-behind queue of tokens for the current statement
}
// doDigestNormalized hashes an already-normalized SQL string and returns
// the hex-encoded SHA-256 digest, resetting the hasher afterwards.
func (d *sqlDigester) doDigestNormalized(normalized string) (result string) {
	// Zero-copy view of the string's bytes so the hasher can consume it
	// without allocating. Safe only because the bytes are never mutated.
	// NOTE(review): constructing a reflect.SliceHeader composite literal is
	// flagged as unsafe misuse by vet on newer toolchains; unsafe.Slice /
	// unsafe.StringData would be the modern form — confirm the module's
	// minimum Go version before changing.
	hdr := *(*reflect.StringHeader)(unsafe.Pointer(&normalized))
	b := *(*[]byte)(unsafe.Pointer(&reflect.SliceHeader{
		Data: hdr.Data,
		Len:  hdr.Len,
		Cap:  hdr.Len,
	}))
	d.hasher.Write(b)
	result = fmt.Sprintf("%x", d.hasher.Sum(nil))
	d.hasher.Reset()
	return
}
  97. func (d *sqlDigester) doDigest(sql string) (result string) {
  98. d.normalize(sql)
  99. d.hasher.Write(d.buffer.Bytes())
  100. d.buffer.Reset()
  101. result = fmt.Sprintf("%x", d.hasher.Sum(nil))
  102. d.hasher.Reset()
  103. return
  104. }
  105. func (d *sqlDigester) doNormalize(sql string) (result string) {
  106. d.normalize(sql)
  107. result = d.buffer.String()
  108. d.buffer.Reset()
  109. return
  110. }
  111. func (d *sqlDigester) doNormalizeDigest(sql string) (normalized, digest string) {
  112. d.normalize(sql)
  113. normalized = d.buffer.String()
  114. d.hasher.Write(d.buffer.Bytes())
  115. d.buffer.Reset()
  116. digest = fmt.Sprintf("%x", d.hasher.Sum(nil))
  117. d.hasher.Reset()
  118. return
  119. }
const (
	// genericSymbol represents a single parameter holder ("?") in the
	// normalized statement.
	// It can be any value as long as it does not collide with real token codes.
	genericSymbol = -1
	// genericSymbolList represents a collapsed parameter holder list
	// ("?, ?, ..." rendered as "...") in the normalized statement.
	// It can be any value as long as it does not collide with real token codes.
	genericSymbolList = -2
)
// normalize tokenizes sql and writes its normalized form into d.buffer,
// leaving d.tokens empty when it returns.
//
// Pass 1 scans the statement, rewriting literals to "?"/"..." (reduceLit)
// and dropping optimizer-hint constructs (reduceOptimizerHint).
// Pass 2 renders the surviving tokens into the buffer, space-separated.
func (d *sqlDigester) normalize(sql string) {
	d.lexer.reset(sql)
	for {
		tok, pos, lit := d.lexer.scan()
		if tok == invalid {
			break
		}
		// A replacement char at EOF means the scanner ran off the end of
		// malformed input; stop rather than loop forever.
		if tok == unicode.ReplacementChar && d.lexer.r.eof() {
			break
		}
		if pos.Offset == len(sql) {
			break
		}
		currTok := token{tok, strings.ToLower(lit)}
		if d.reduceOptimizerHint(&currTok) {
			continue
		}
		d.reduceLit(&currTok)
		if currTok.tok == identifier {
			// Charset introducers ("_utf8" etc.): an identifier starting
			// with "_" that names a known charset becomes underscoreCS.
			if strings.HasPrefix(currTok.lit, "_") {
				_, _, err := charset.GetCharsetInfo(currTok.lit[1:])
				if err == nil {
					currTok.tok = underscoreCS
					goto APPEND
				}
			}
			// Promote identifiers that are really keywords so that quoted
			// and unquoted spellings normalize identically.
			if tok1 := d.lexer.isTokenIdentifier(currTok.lit, pos.Offset); tok1 != 0 {
				currTok.tok = tok1
			}
		}
	APPEND:
		d.tokens.pushBack(currTok)
	}
	// Reset the lexer so the pooled digester no longer references sql.
	d.lexer.reset("")
	for i, token := range d.tokens {
		if i > 0 {
			d.buffer.WriteRune(' ')
		}
		if token.tok == singleAtIdentifier {
			d.buffer.WriteString("@")
			d.buffer.WriteString(token.lit)
		} else if token.tok == underscoreCS {
			// All charset introducers collapse to one placeholder.
			d.buffer.WriteString("(_charset)")
		} else if token.tok == identifier || token.tok == quotedIdentifier {
			// Identifiers are rendered back-quoted in canonical form.
			d.buffer.WriteByte('`')
			d.buffer.WriteString(token.lit)
			d.buffer.WriteByte('`')
		} else {
			d.buffer.WriteString(token.lit)
		}
	}
	d.tokens.reset()
}
// reduceOptimizerHint filters constructs that should not influence the
// digest: hint comments, force/use/ignore index(...) clauses, and
// straight_join (rewritten to plain join). It reports true when the
// current token (and possibly already-queued tokens) must be dropped.
func (d *sqlDigester) reduceOptimizerHint(tok *token) (reduced bool) {
	// ignore /*+..*/
	// NOTE(review): reduced stays false here, so a hintComment token would
	// still be appended by the caller — confirm whether the scanner ever
	// emits hintComment in this mode or whether this should report true.
	if tok.tok == hintComment {
		return
	}
	// ignore force/use/ignore index(x): when the previous queued token is
	// force/use/ignore and the current literal is "index", consume raw
	// tokens straight from the lexer until the closing ")".
	if tok.lit == "index" {
		toks := d.tokens.back(1)
		if len(toks) > 0 {
			switch strings.ToLower(toks[0].lit) {
			case "force", "use", "ignore":
				for {
					tok, _, lit := d.lexer.scan()
					if tok == 0 || (tok == unicode.ReplacementChar && d.lexer.r.eof()) {
						break
					}
					if lit == ")" {
						reduced = true
						// Also drop the queued force/use/ignore keyword.
						d.tokens.popBack(1)
						break
					}
				}
				return
			}
		}
	}
	// ignore straight_join: digested as a plain join.
	if tok.lit == "straight_join" {
		tok.lit = "join"
		return
	}
	return
}
// reduceLit rewrites literal tokens in place so that statements differing
// only in constant values share one normalized form:
//
//	count(*)    => count(?)    (star used as a parameter)
//	-5 / +5     => ?           (unary sign folded into the literal)
//	?, ?, ?     => ...         (runs of parameters collapse to a list)
//	order by 1  => order by 1  (column-position integers are kept)
func (d *sqlDigester) reduceLit(currTok *token) {
	if !d.isLit(*currTok) {
		return
	}
	// count(*) => count(?)
	if currTok.lit == "*" {
		if d.isStarParam() {
			currTok.tok = genericSymbol
			currTok.lit = "?"
		}
		return
	}
	// "-x" or "+x" => "x": drop the already-queued unary sign token.
	if d.isPrefixByUnary(currTok.tok) {
		d.tokens.popBack(1)
	}
	// "?, ?, ?, ?" => "...": merge this literal into a preceding list.
	last2 := d.tokens.back(2)
	if d.isGenericList(last2) {
		d.tokens.popBack(2)
		currTok.tok = genericSymbolList
		currTok.lit = "..."
		return
	}
	// Integers after "order by" / "group by" are column positions, not
	// values, so they are kept verbatim.
	if currTok.tok == intLit {
		if d.isOrderOrGroupBy() {
			return
		}
	}
	// Any other literal becomes the generic parameter marker: 2 => ?
	currTok.tok = genericSymbol
	currTok.lit = "?"
}
// isPrefixByUnary reports whether the numeric literal with token code
// currTok is preceded by a unary "+" or "-" (as opposed to a binary
// operator), in which case the sign should be folded into the literal.
func (d *sqlDigester) isPrefixByUnary(currTok int) (isUnary bool) {
	if !d.isNumLit(currTok) {
		return
	}
	last := d.tokens.back(1)
	if last == nil {
		return
	}
	// The immediately preceding token must be a sign at all.
	if last[0].lit != "-" && last[0].lit != "+" {
		return
	}
	last2 := d.tokens.back(2)
	if last2 == nil {
		// The sign is the very first token of the statement: unary.
		isUnary = true
		return
	}
	// Unary when the sign follows a delimiter or operator:
	// '(-x', ',-x', ',+x', '--x', '+-x', 'a >= -x', ...
	switch last2[0].lit {
	case "(", ",", "+", "-", ">=", "is", "<=", "=", "<", ">":
		isUnary = true
	default:
	}
	// "select -x" or "select +x" is also unary.
	last2Lit := strings.ToLower(last2[0].lit)
	if last2Lit == "select" {
		isUnary = true
	}
	return
}
  278. func (d *sqlDigester) isGenericList(last2 []token) (generic bool) {
  279. if len(last2) < 2 {
  280. return false
  281. }
  282. if !d.isComma(last2[1]) {
  283. return false
  284. }
  285. switch last2[0].tok {
  286. case genericSymbol, genericSymbolList:
  287. generic = true
  288. default:
  289. }
  290. return
  291. }
// isOrderOrGroupBy reports whether the integer literal currently being
// reduced sits inside an "order by" / "group by" item list, where numbers
// denote column positions and must not be replaced with "?".
func (d *sqlDigester) isOrderOrGroupBy() (orderOrGroupBy bool) {
	var (
		last []token
		n    int
	)
	// Walk back over "<item> ," pairs so every element of the list is
	// treated alike: "order by 1, 2, 3" should NOT become "order by ?, ?, ?".
	for n = 2; ; n += 2 {
		last = d.tokens.back(n)
		if len(last) < 2 {
			return false
		}
		if !d.isComma(last[1]) {
			break
		}
	}
	// Handle a parenthesized list, e.g. "group by (1, 2)": step over the
	// "(" to see what precedes it.
	if last[1].lit == "(" {
		last = d.tokens.back(n + 1)
		if len(last) < 2 {
			return false
		}
	}
	orderOrGroupBy = (last[0].lit == "order" || last[0].lit == "group") && last[1].lit == "by"
	return
}
  317. func (d *sqlDigester) isStarParam() (starParam bool) {
  318. last := d.tokens.back(1)
  319. if last == nil {
  320. starParam = false
  321. return
  322. }
  323. starParam = last[0].lit == "("
  324. return
  325. }
  326. func (d *sqlDigester) isLit(t token) (beLit bool) {
  327. tok := t.tok
  328. if d.isNumLit(tok) || tok == stringLit || tok == bitLit || tok == paramMarker {
  329. beLit = true
  330. } else if t.lit == "*" {
  331. beLit = true
  332. } else if tok == null || (tok == identifier && strings.ToLower(t.lit) == "null") {
  333. beLit = true
  334. }
  335. return
  336. }
  337. func (d *sqlDigester) isNumLit(tok int) (beNum bool) {
  338. switch tok {
  339. case intLit, decLit, floatLit, hexLit:
  340. beNum = true
  341. default:
  342. }
  343. return
  344. }
  345. func (d *sqlDigester) isComma(tok token) (isComma bool) {
  346. isComma = tok.lit == ","
  347. return
  348. }
  349. type token struct {
  350. tok int
  351. lit string
  352. }
  353. type tokenDeque []token
  354. func (s *tokenDeque) reset() {
  355. *s = (*s)[:0]
  356. }
  357. func (s *tokenDeque) pushBack(t token) {
  358. *s = append(*s, t)
  359. }
  360. func (s *tokenDeque) popBack(n int) (t []token) {
  361. if len(*s) < n {
  362. t = nil
  363. return
  364. }
  365. t = (*s)[len(*s)-n:]
  366. *s = (*s)[:len(*s)-n]
  367. return
  368. }
  369. func (s *tokenDeque) back(n int) (t []token) {
  370. if len(*s)-n < 0 {
  371. return
  372. }
  373. t = (*s)[len(*s)-n:]
  374. return
  375. }