charset.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462
  1. // Copyright 2015 PingCAP, Inc.
  2. //
  3. // Licensed under the Apache License, Version 2.0 (the "License");
  4. // you may not use this file except in compliance with the License.
  5. // You may obtain a copy of the License at
  6. //
  7. // http://www.apache.org/licenses/LICENSE-2.0
  8. //
  9. // Unless required by applicable law or agreed to in writing, software
  10. // distributed under the License is distributed on an "AS IS" BASIS,
  11. // See the License for the specific language governing permissions and
  12. // limitations under the License.
  13. package charset
  14. import (
  15. "strings"
  16. "github.com/pingcap/errors"
  17. "github.com/pingcap/parser/mysql"
  18. "github.com/pingcap/parser/terror"
  19. )
  20. var (
  21. ErrUnknownCollation = terror.ClassDDL.NewStd(mysql.ErrUnknownCollation)
  22. ErrCollationCharsetMismatch = terror.ClassDDL.NewStd(mysql.ErrCollationCharsetMismatch)
  23. )
  24. // Charset is a charset.
  25. // Now we only support MySQL.
  26. type Charset struct {
  27. Name string
  28. DefaultCollation string
  29. Collations map[string]*Collation
  30. Desc string
  31. Maxlen int
  32. }
  33. // Collation is a collation.
  34. // Now we only support MySQL.
  35. type Collation struct {
  36. ID int
  37. CharsetName string
  38. Name string
  39. IsDefault bool
  40. }
  41. var charsets = make(map[string]*Charset)
  42. var collationsIDMap = make(map[int]*Collation)
  43. var collationsNameMap = make(map[string]*Collation)
  44. var descs = make([]*Desc, 0, len(charsetInfos))
  45. var supportedCollations = make([]*Collation, 0, len(supportedCollationNames))
  46. // All the supported charsets should be in the following table.
  47. var charsetInfos = []*Charset{
  48. {CharsetUTF8, CollationUTF8, make(map[string]*Collation), "UTF-8 Unicode", 3},
  49. {CharsetUTF8MB4, CollationUTF8MB4, make(map[string]*Collation), "UTF-8 Unicode", 4},
  50. {CharsetASCII, CollationASCII, make(map[string]*Collation), "US ASCII", 1},
  51. {CharsetLatin1, CollationLatin1, make(map[string]*Collation), "Latin1", 1},
  52. {CharsetBin, CollationBin, make(map[string]*Collation), "binary", 1},
  53. }
  54. // All the names supported collations should be in the following table.
  55. var supportedCollationNames = map[string]struct{}{
  56. CollationUTF8: {},
  57. CollationUTF8MB4: {},
  58. CollationASCII: {},
  59. CollationLatin1: {},
  60. CollationBin: {},
  61. }
  62. // Desc is a charset description.
  63. type Desc struct {
  64. Name string
  65. Desc string
  66. DefaultCollation string
  67. Maxlen int
  68. }
  69. // GetSupportedCharsets gets descriptions for all charsets supported so far.
  70. func GetSupportedCharsets() []*Desc {
  71. return descs
  72. }
  73. // GetSupportedCollations gets information for all collations supported so far.
  74. func GetSupportedCollations() []*Collation {
  75. return supportedCollations
  76. }
  77. // ValidCharsetAndCollation checks the charset and the collation validity
  78. // and returns a boolean.
  79. func ValidCharsetAndCollation(cs string, co string) bool {
  80. // We will use utf8 as a default charset.
  81. if cs == "" {
  82. cs = "utf8"
  83. }
  84. cs = strings.ToLower(cs)
  85. c, ok := charsets[cs]
  86. if !ok {
  87. return false
  88. }
  89. if co == "" {
  90. return true
  91. }
  92. co = strings.ToLower(co)
  93. _, ok = c.Collations[co]
  94. return ok
  95. }
  96. // GetDefaultCollation returns the default collation for charset.
  97. func GetDefaultCollation(charset string) (string, error) {
  98. charset = strings.ToLower(charset)
  99. if charset == CharsetBin {
  100. return CollationBin, nil
  101. }
  102. c, ok := charsets[charset]
  103. if !ok {
  104. return "", errors.Errorf("Unknown charset %s", charset)
  105. }
  106. return c.DefaultCollation, nil
  107. }
  108. // GetDefaultCharsetAndCollate returns the default charset and collation.
  109. func GetDefaultCharsetAndCollate() (string, string) {
  110. return mysql.DefaultCharset, mysql.DefaultCollationName
  111. }
  112. // GetCharsetInfo returns charset and collation for cs as name.
  113. func GetCharsetInfo(cs string) (string, string, error) {
  114. c, ok := charsets[strings.ToLower(cs)]
  115. if !ok {
  116. return "", "", errors.Errorf("Unknown charset %s", cs)
  117. }
  118. return c.Name, c.DefaultCollation, nil
  119. }
  120. // GetCharsetDesc gets charset descriptions in the local charsets.
  121. func GetCharsetDesc(cs string) (*Desc, error) {
  122. switch strings.ToLower(cs) {
  123. case CharsetUTF8:
  124. return descs[0], nil
  125. case CharsetUTF8MB4:
  126. return descs[1], nil
  127. case CharsetASCII:
  128. return descs[2], nil
  129. case CharsetLatin1:
  130. return descs[3], nil
  131. case CharsetBin:
  132. return descs[4], nil
  133. default:
  134. return nil, errors.Errorf("Unknown charset %s", cs)
  135. }
  136. }
  137. // GetCharsetInfoByID returns charset and collation for id as cs_number.
  138. func GetCharsetInfoByID(coID int) (string, string, error) {
  139. if coID == mysql.DefaultCollationID {
  140. return mysql.DefaultCharset, mysql.DefaultCollationName, nil
  141. }
  142. if collation, ok := collationsIDMap[coID]; ok {
  143. return collation.CharsetName, collation.Name, nil
  144. }
  145. return "", "", errors.Errorf("Unknown charset id %d", coID)
  146. }
  147. // GetCollations returns a list for all collations.
  148. func GetCollations() []*Collation {
  149. return collations
  150. }
  151. func GetCollationByName(name string) (*Collation, error) {
  152. collation, ok := collationsNameMap[strings.ToLower(name)]
  153. if !ok {
  154. return nil, ErrUnknownCollation.GenWithStackByArgs(name)
  155. }
  156. return collation, nil
  157. }
  158. // GetCollationByID returns collations by given id.
  159. func GetCollationByID(id int) (*Collation, error) {
  160. collation, ok := collationsIDMap[id]
  161. if !ok {
  162. return nil, errors.Errorf("Unknown collation id %d", id)
  163. }
  164. return collation, nil
  165. }
  166. const (
  167. // CharsetBin is used for marking binary charset.
  168. CharsetBin = "binary"
  169. // CollationBin is the default collation for CharsetBin.
  170. CollationBin = "binary"
  171. // CharsetUTF8 is the default charset for string types.
  172. CharsetUTF8 = "utf8"
  173. // CollationUTF8 is the default collation for CharsetUTF8.
  174. CollationUTF8 = "utf8_bin"
  175. // CharsetUTF8MB4 represents 4 bytes utf8, which works the same way as utf8 in Go.
  176. CharsetUTF8MB4 = "utf8mb4"
  177. // CollationUTF8MB4 is the default collation for CharsetUTF8MB4.
  178. CollationUTF8MB4 = "utf8mb4_bin"
  179. // CharsetASCII is a subset of UTF8.
  180. CharsetASCII = "ascii"
  181. // CollationASCII is the default collation for CharsetACSII.
  182. CollationASCII = "ascii_bin"
  183. // CharsetLatin1 is a single byte charset.
  184. CharsetLatin1 = "latin1"
  185. // CollationLatin1 is the default collation for CharsetLatin1.
  186. CollationLatin1 = "latin1_bin"
  187. )
// collations is the table of every collation known to this package. Each row
// is a positional Collation literal: {ID, CharsetName, Name, IsDefault}.
//
// init indexes the rows by ID and by name, and attaches a row to its charset
// when that charset is present in charsetInfos; rows of other charsets are
// still reachable through the ID and name lookups.
// NOTE(review): the IDs presumably mirror MySQL's collation IDs — confirm
// before adding or renumbering rows.
var collations = []*Collation{
	{1, "big5", "big5_chinese_ci", true},
	{2, "latin2", "latin2_czech_cs", false},
	{3, "dec8", "dec8_swedish_ci", true},
	{4, "cp850", "cp850_general_ci", true},
	{5, "latin1", "latin1_german1_ci", false},
	{6, "hp8", "hp8_english_ci", true},
	{7, "koi8r", "koi8r_general_ci", true},
	{8, "latin1", "latin1_swedish_ci", false},
	{9, "latin2", "latin2_general_ci", true},
	{10, "swe7", "swe7_swedish_ci", true},
	{11, "ascii", "ascii_general_ci", false},
	{12, "ujis", "ujis_japanese_ci", true},
	{13, "sjis", "sjis_japanese_ci", true},
	{14, "cp1251", "cp1251_bulgarian_ci", false},
	{15, "latin1", "latin1_danish_ci", false},
	{16, "hebrew", "hebrew_general_ci", true},
	{18, "tis620", "tis620_thai_ci", true},
	{19, "euckr", "euckr_korean_ci", true},
	{20, "latin7", "latin7_estonian_cs", false},
	{21, "latin2", "latin2_hungarian_ci", false},
	{22, "koi8u", "koi8u_general_ci", true},
	{23, "cp1251", "cp1251_ukrainian_ci", false},
	{24, "gb2312", "gb2312_chinese_ci", true},
	{25, "greek", "greek_general_ci", true},
	{26, "cp1250", "cp1250_general_ci", true},
	{27, "latin2", "latin2_croatian_ci", false},
	{28, "gbk", "gbk_chinese_ci", true},
	{29, "cp1257", "cp1257_lithuanian_ci", false},
	{30, "latin5", "latin5_turkish_ci", true},
	{31, "latin1", "latin1_german2_ci", false},
	{32, "armscii8", "armscii8_general_ci", true},
	{33, "utf8", "utf8_general_ci", false},
	{34, "cp1250", "cp1250_czech_cs", false},
	{35, "ucs2", "ucs2_general_ci", true},
	{36, "cp866", "cp866_general_ci", true},
	{37, "keybcs2", "keybcs2_general_ci", true},
	{38, "macce", "macce_general_ci", true},
	{39, "macroman", "macroman_general_ci", true},
	{40, "cp852", "cp852_general_ci", true},
	{41, "latin7", "latin7_general_ci", true},
	{42, "latin7", "latin7_general_cs", false},
	{43, "macce", "macce_bin", false},
	{44, "cp1250", "cp1250_croatian_ci", false},
	{45, "utf8mb4", "utf8mb4_general_ci", false},
	{46, "utf8mb4", "utf8mb4_bin", true},
	{47, "latin1", "latin1_bin", true},
	{48, "latin1", "latin1_general_ci", false},
	{49, "latin1", "latin1_general_cs", false},
	{50, "cp1251", "cp1251_bin", false},
	{51, "cp1251", "cp1251_general_ci", true},
	{52, "cp1251", "cp1251_general_cs", false},
	{53, "macroman", "macroman_bin", false},
	{54, "utf16", "utf16_general_ci", true},
	{55, "utf16", "utf16_bin", false},
	{56, "utf16le", "utf16le_general_ci", true},
	{57, "cp1256", "cp1256_general_ci", true},
	{58, "cp1257", "cp1257_bin", false},
	{59, "cp1257", "cp1257_general_ci", true},
	{60, "utf32", "utf32_general_ci", true},
	{61, "utf32", "utf32_bin", false},
	{62, "utf16le", "utf16le_bin", false},
	{63, "binary", "binary", true},
	{64, "armscii8", "armscii8_bin", false},
	{65, "ascii", "ascii_bin", true},
	{66, "cp1250", "cp1250_bin", false},
	{67, "cp1256", "cp1256_bin", false},
	{68, "cp866", "cp866_bin", false},
	{69, "dec8", "dec8_bin", false},
	{70, "greek", "greek_bin", false},
	{71, "hebrew", "hebrew_bin", false},
	{72, "hp8", "hp8_bin", false},
	{73, "keybcs2", "keybcs2_bin", false},
	{74, "koi8r", "koi8r_bin", false},
	{75, "koi8u", "koi8u_bin", false},
	{77, "latin2", "latin2_bin", false},
	{78, "latin5", "latin5_bin", false},
	{79, "latin7", "latin7_bin", false},
	{80, "cp850", "cp850_bin", false},
	{81, "cp852", "cp852_bin", false},
	{82, "swe7", "swe7_bin", false},
	{83, "utf8", "utf8_bin", true},
	{84, "big5", "big5_bin", false},
	{85, "euckr", "euckr_bin", false},
	{86, "gb2312", "gb2312_bin", false},
	{87, "gbk", "gbk_bin", false},
	{88, "sjis", "sjis_bin", false},
	{89, "tis620", "tis620_bin", false},
	{90, "ucs2", "ucs2_bin", false},
	{91, "ujis", "ujis_bin", false},
	{92, "geostd8", "geostd8_general_ci", true},
	{93, "geostd8", "geostd8_bin", false},
	{94, "latin1", "latin1_spanish_ci", false},
	{95, "cp932", "cp932_japanese_ci", true},
	{96, "cp932", "cp932_bin", false},
	{97, "eucjpms", "eucjpms_japanese_ci", true},
	{98, "eucjpms", "eucjpms_bin", false},
	{99, "cp1250", "cp1250_polish_ci", false},
	{101, "utf16", "utf16_unicode_ci", false},
	{102, "utf16", "utf16_icelandic_ci", false},
	{103, "utf16", "utf16_latvian_ci", false},
	{104, "utf16", "utf16_romanian_ci", false},
	{105, "utf16", "utf16_slovenian_ci", false},
	{106, "utf16", "utf16_polish_ci", false},
	{107, "utf16", "utf16_estonian_ci", false},
	{108, "utf16", "utf16_spanish_ci", false},
	{109, "utf16", "utf16_swedish_ci", false},
	{110, "utf16", "utf16_turkish_ci", false},
	{111, "utf16", "utf16_czech_ci", false},
	{112, "utf16", "utf16_danish_ci", false},
	{113, "utf16", "utf16_lithuanian_ci", false},
	{114, "utf16", "utf16_slovak_ci", false},
	{115, "utf16", "utf16_spanish2_ci", false},
	{116, "utf16", "utf16_roman_ci", false},
	{117, "utf16", "utf16_persian_ci", false},
	{118, "utf16", "utf16_esperanto_ci", false},
	{119, "utf16", "utf16_hungarian_ci", false},
	{120, "utf16", "utf16_sinhala_ci", false},
	{121, "utf16", "utf16_german2_ci", false},
	{122, "utf16", "utf16_croatian_ci", false},
	{123, "utf16", "utf16_unicode_520_ci", false},
	{124, "utf16", "utf16_vietnamese_ci", false},
	{128, "ucs2", "ucs2_unicode_ci", false},
	{129, "ucs2", "ucs2_icelandic_ci", false},
	{130, "ucs2", "ucs2_latvian_ci", false},
	{131, "ucs2", "ucs2_romanian_ci", false},
	{132, "ucs2", "ucs2_slovenian_ci", false},
	{133, "ucs2", "ucs2_polish_ci", false},
	{134, "ucs2", "ucs2_estonian_ci", false},
	{135, "ucs2", "ucs2_spanish_ci", false},
	{136, "ucs2", "ucs2_swedish_ci", false},
	{137, "ucs2", "ucs2_turkish_ci", false},
	{138, "ucs2", "ucs2_czech_ci", false},
	{139, "ucs2", "ucs2_danish_ci", false},
	{140, "ucs2", "ucs2_lithuanian_ci", false},
	{141, "ucs2", "ucs2_slovak_ci", false},
	{142, "ucs2", "ucs2_spanish2_ci", false},
	{143, "ucs2", "ucs2_roman_ci", false},
	{144, "ucs2", "ucs2_persian_ci", false},
	{145, "ucs2", "ucs2_esperanto_ci", false},
	{146, "ucs2", "ucs2_hungarian_ci", false},
	{147, "ucs2", "ucs2_sinhala_ci", false},
	{148, "ucs2", "ucs2_german2_ci", false},
	{149, "ucs2", "ucs2_croatian_ci", false},
	{150, "ucs2", "ucs2_unicode_520_ci", false},
	{151, "ucs2", "ucs2_vietnamese_ci", false},
	{159, "ucs2", "ucs2_general_mysql500_ci", false},
	{160, "utf32", "utf32_unicode_ci", false},
	{161, "utf32", "utf32_icelandic_ci", false},
	{162, "utf32", "utf32_latvian_ci", false},
	{163, "utf32", "utf32_romanian_ci", false},
	{164, "utf32", "utf32_slovenian_ci", false},
	{165, "utf32", "utf32_polish_ci", false},
	{166, "utf32", "utf32_estonian_ci", false},
	{167, "utf32", "utf32_spanish_ci", false},
	{168, "utf32", "utf32_swedish_ci", false},
	{169, "utf32", "utf32_turkish_ci", false},
	{170, "utf32", "utf32_czech_ci", false},
	{171, "utf32", "utf32_danish_ci", false},
	{172, "utf32", "utf32_lithuanian_ci", false},
	{173, "utf32", "utf32_slovak_ci", false},
	{174, "utf32", "utf32_spanish2_ci", false},
	{175, "utf32", "utf32_roman_ci", false},
	{176, "utf32", "utf32_persian_ci", false},
	{177, "utf32", "utf32_esperanto_ci", false},
	{178, "utf32", "utf32_hungarian_ci", false},
	{179, "utf32", "utf32_sinhala_ci", false},
	{180, "utf32", "utf32_german2_ci", false},
	{181, "utf32", "utf32_croatian_ci", false},
	{182, "utf32", "utf32_unicode_520_ci", false},
	{183, "utf32", "utf32_vietnamese_ci", false},
	{192, "utf8", "utf8_unicode_ci", false},
	{193, "utf8", "utf8_icelandic_ci", false},
	{194, "utf8", "utf8_latvian_ci", false},
	{195, "utf8", "utf8_romanian_ci", false},
	{196, "utf8", "utf8_slovenian_ci", false},
	{197, "utf8", "utf8_polish_ci", false},
	{198, "utf8", "utf8_estonian_ci", false},
	{199, "utf8", "utf8_spanish_ci", false},
	{200, "utf8", "utf8_swedish_ci", false},
	{201, "utf8", "utf8_turkish_ci", false},
	{202, "utf8", "utf8_czech_ci", false},
	{203, "utf8", "utf8_danish_ci", false},
	{204, "utf8", "utf8_lithuanian_ci", false},
	{205, "utf8", "utf8_slovak_ci", false},
	{206, "utf8", "utf8_spanish2_ci", false},
	{207, "utf8", "utf8_roman_ci", false},
	{208, "utf8", "utf8_persian_ci", false},
	{209, "utf8", "utf8_esperanto_ci", false},
	{210, "utf8", "utf8_hungarian_ci", false},
	{211, "utf8", "utf8_sinhala_ci", false},
	{212, "utf8", "utf8_german2_ci", false},
	{213, "utf8", "utf8_croatian_ci", false},
	{214, "utf8", "utf8_unicode_520_ci", false},
	{215, "utf8", "utf8_vietnamese_ci", false},
	{223, "utf8", "utf8_general_mysql500_ci", false},
	{224, "utf8mb4", "utf8mb4_unicode_ci", false},
	{225, "utf8mb4", "utf8mb4_icelandic_ci", false},
	{226, "utf8mb4", "utf8mb4_latvian_ci", false},
	{227, "utf8mb4", "utf8mb4_romanian_ci", false},
	{228, "utf8mb4", "utf8mb4_slovenian_ci", false},
	{229, "utf8mb4", "utf8mb4_polish_ci", false},
	{230, "utf8mb4", "utf8mb4_estonian_ci", false},
	{231, "utf8mb4", "utf8mb4_spanish_ci", false},
	{232, "utf8mb4", "utf8mb4_swedish_ci", false},
	{233, "utf8mb4", "utf8mb4_turkish_ci", false},
	{234, "utf8mb4", "utf8mb4_czech_ci", false},
	{235, "utf8mb4", "utf8mb4_danish_ci", false},
	{236, "utf8mb4", "utf8mb4_lithuanian_ci", false},
	{237, "utf8mb4", "utf8mb4_slovak_ci", false},
	{238, "utf8mb4", "utf8mb4_spanish2_ci", false},
	{239, "utf8mb4", "utf8mb4_roman_ci", false},
	{240, "utf8mb4", "utf8mb4_persian_ci", false},
	{241, "utf8mb4", "utf8mb4_esperanto_ci", false},
	{242, "utf8mb4", "utf8mb4_hungarian_ci", false},
	{243, "utf8mb4", "utf8mb4_sinhala_ci", false},
	{244, "utf8mb4", "utf8mb4_german2_ci", false},
	{245, "utf8mb4", "utf8mb4_croatian_ci", false},
	{246, "utf8mb4", "utf8mb4_unicode_520_ci", false},
	{247, "utf8mb4", "utf8mb4_vietnamese_ci", false},
	{255, "utf8mb4", "utf8mb4_0900_ai_ci", false},
	{2048, "utf8mb4", "utf8mb4_zh_pinyin_tidb_as_cs", false},
}
  411. // init method always puts to the end of file.
  412. func init() {
  413. for _, c := range charsetInfos {
  414. charsets[c.Name] = c
  415. desc := &Desc{
  416. Name: c.Name,
  417. DefaultCollation: c.DefaultCollation,
  418. Desc: c.Desc,
  419. Maxlen: c.Maxlen,
  420. }
  421. descs = append(descs, desc)
  422. }
  423. for _, c := range collations {
  424. collationsIDMap[c.ID] = c
  425. collationsNameMap[c.Name] = c
  426. if _, ok := supportedCollationNames[c.Name]; ok {
  427. supportedCollations = append(supportedCollations, c)
  428. }
  429. if charset, ok := charsets[c.CharsetName]; ok {
  430. charset.Collations[c.Name] = c
  431. }
  432. }
  433. }