unicode.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package unicode provides Unicode encodings such as UTF-16.
  5. package unicode // import "golang.org/x/text/encoding/unicode"
  6. import (
  7. "bytes"
  8. "errors"
  9. "unicode/utf16"
  10. "unicode/utf8"
  11. "golang.org/x/text/encoding"
  12. "golang.org/x/text/encoding/internal"
  13. "golang.org/x/text/encoding/internal/identifier"
  14. "golang.org/x/text/internal/utf8internal"
  15. "golang.org/x/text/runes"
  16. "golang.org/x/text/transform"
  17. )
// TODO: I think the Transformers really should return errors on unmatched
// surrogate pairs and odd numbers of bytes. This is not required by RFC 2781,
// which leaves it open, but is suggested by WHATWG. It will allow for all error
// modes as defined by WHATWG: fatal, HTML and Replacement. This would require
// the introduction of some kind of error type for conveying the erroneous code
// point.
  24. // UTF8 is the UTF-8 encoding. It neither removes nor adds byte order marks.
  25. var UTF8 encoding.Encoding = utf8enc
  26. // UTF8BOM is an UTF-8 encoding where the decoder strips a leading byte order
  27. // mark while the encoder adds one.
  28. //
  29. // Some editors add a byte order mark as a signature to UTF-8 files. Although
  30. // the byte order mark is not useful for detecting byte order in UTF-8, it is
  31. // sometimes used as a convention to mark UTF-8-encoded files. This relies on
  32. // the observation that the UTF-8 byte order mark is either an illegal or at
  33. // least very unlikely sequence in any other character encoding.
  34. var UTF8BOM encoding.Encoding = utf8bomEncoding{}
  35. type utf8bomEncoding struct{}
  36. func (utf8bomEncoding) String() string {
  37. return "UTF-8-BOM"
  38. }
  39. func (utf8bomEncoding) ID() (identifier.MIB, string) {
  40. return identifier.Unofficial, "x-utf8bom"
  41. }
  42. func (utf8bomEncoding) NewEncoder() *encoding.Encoder {
  43. return &encoding.Encoder{
  44. Transformer: &utf8bomEncoder{t: runes.ReplaceIllFormed()},
  45. }
  46. }
  47. func (utf8bomEncoding) NewDecoder() *encoding.Decoder {
  48. return &encoding.Decoder{Transformer: &utf8bomDecoder{}}
  49. }
  50. var utf8enc = &internal.Encoding{
  51. &internal.SimpleEncoding{utf8Decoder{}, runes.ReplaceIllFormed()},
  52. "UTF-8",
  53. identifier.UTF8,
  54. }
  55. type utf8bomDecoder struct {
  56. checked bool
  57. }
  58. func (t *utf8bomDecoder) Reset() {
  59. t.checked = false
  60. }
  61. func (t *utf8bomDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  62. if !t.checked {
  63. if !atEOF && len(src) < len(utf8BOM) {
  64. if len(src) == 0 {
  65. return 0, 0, nil
  66. }
  67. return 0, 0, transform.ErrShortSrc
  68. }
  69. if bytes.HasPrefix(src, []byte(utf8BOM)) {
  70. nSrc += len(utf8BOM)
  71. src = src[len(utf8BOM):]
  72. }
  73. t.checked = true
  74. }
  75. nDst, n, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF)
  76. nSrc += n
  77. return nDst, nSrc, err
  78. }
  79. type utf8bomEncoder struct {
  80. written bool
  81. t transform.Transformer
  82. }
  83. func (t *utf8bomEncoder) Reset() {
  84. t.written = false
  85. t.t.Reset()
  86. }
  87. func (t *utf8bomEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  88. if !t.written {
  89. if len(dst) < len(utf8BOM) {
  90. return nDst, 0, transform.ErrShortDst
  91. }
  92. nDst = copy(dst, utf8BOM)
  93. t.written = true
  94. }
  95. n, nSrc, err := utf8Decoder.Transform(utf8Decoder{}, dst[nDst:], src, atEOF)
  96. nDst += n
  97. return nDst, nSrc, err
  98. }
  99. type utf8Decoder struct{ transform.NopResetter }
  100. func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  101. var pSrc int // point from which to start copy in src
  102. var accept utf8internal.AcceptRange
  103. // The decoder can only make the input larger, not smaller.
  104. n := len(src)
  105. if len(dst) < n {
  106. err = transform.ErrShortDst
  107. n = len(dst)
  108. atEOF = false
  109. }
  110. for nSrc < n {
  111. c := src[nSrc]
  112. if c < utf8.RuneSelf {
  113. nSrc++
  114. continue
  115. }
  116. first := utf8internal.First[c]
  117. size := int(first & utf8internal.SizeMask)
  118. if first == utf8internal.FirstInvalid {
  119. goto handleInvalid // invalid starter byte
  120. }
  121. accept = utf8internal.AcceptRanges[first>>utf8internal.AcceptShift]
  122. if nSrc+size > n {
  123. if !atEOF {
  124. // We may stop earlier than necessary here if the short sequence
  125. // has invalid bytes. Not checking for this simplifies the code
  126. // and may avoid duplicate computations in certain conditions.
  127. if err == nil {
  128. err = transform.ErrShortSrc
  129. }
  130. break
  131. }
  132. // Determine the maximal subpart of an ill-formed subsequence.
  133. switch {
  134. case nSrc+1 >= n || src[nSrc+1] < accept.Lo || accept.Hi < src[nSrc+1]:
  135. size = 1
  136. case nSrc+2 >= n || src[nSrc+2] < utf8internal.LoCB || utf8internal.HiCB < src[nSrc+2]:
  137. size = 2
  138. default:
  139. size = 3 // As we are short, the maximum is 3.
  140. }
  141. goto handleInvalid
  142. }
  143. if c = src[nSrc+1]; c < accept.Lo || accept.Hi < c {
  144. size = 1
  145. goto handleInvalid // invalid continuation byte
  146. } else if size == 2 {
  147. } else if c = src[nSrc+2]; c < utf8internal.LoCB || utf8internal.HiCB < c {
  148. size = 2
  149. goto handleInvalid // invalid continuation byte
  150. } else if size == 3 {
  151. } else if c = src[nSrc+3]; c < utf8internal.LoCB || utf8internal.HiCB < c {
  152. size = 3
  153. goto handleInvalid // invalid continuation byte
  154. }
  155. nSrc += size
  156. continue
  157. handleInvalid:
  158. // Copy the scanned input so far.
  159. nDst += copy(dst[nDst:], src[pSrc:nSrc])
  160. // Append RuneError to the destination.
  161. const runeError = "\ufffd"
  162. if nDst+len(runeError) > len(dst) {
  163. return nDst, nSrc, transform.ErrShortDst
  164. }
  165. nDst += copy(dst[nDst:], runeError)
  166. // Skip the maximal subpart of an ill-formed subsequence according to
  167. // the W3C standard way instead of the Go way. This Transform is
  168. // probably the only place in the text repo where it is warranted.
  169. nSrc += size
  170. pSrc = nSrc
  171. // Recompute the maximum source length.
  172. if sz := len(dst) - nDst; sz < len(src)-nSrc {
  173. err = transform.ErrShortDst
  174. n = nSrc + sz
  175. atEOF = false
  176. }
  177. }
  178. return nDst + copy(dst[nDst:], src[pSrc:nSrc]), nSrc, err
  179. }
  180. // UTF16 returns a UTF-16 Encoding for the given default endianness and byte
  181. // order mark (BOM) policy.
  182. //
  183. // When decoding from UTF-16 to UTF-8, if the BOMPolicy is IgnoreBOM then
  184. // neither BOMs U+FEFF nor noncharacters U+FFFE in the input stream will affect
  185. // the endianness used for decoding, and will instead be output as their
  186. // standard UTF-8 encodings: "\xef\xbb\xbf" and "\xef\xbf\xbe". If the BOMPolicy
  187. // is UseBOM or ExpectBOM a staring BOM is not written to the UTF-8 output.
  188. // Instead, it overrides the default endianness e for the remainder of the
  189. // transformation. Any subsequent BOMs U+FEFF or noncharacters U+FFFE will not
  190. // affect the endianness used, and will instead be output as their standard
  191. // UTF-8 encodings. For UseBOM, if there is no starting BOM, it will proceed
  192. // with the default Endianness. For ExpectBOM, in that case, the transformation
  193. // will return early with an ErrMissingBOM error.
  194. //
  195. // When encoding from UTF-8 to UTF-16, a BOM will be inserted at the start of
  196. // the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM will not
  197. // be inserted. The UTF-8 input does not need to contain a BOM.
  198. //
  199. // There is no concept of a 'native' endianness. If the UTF-16 data is produced
  200. // and consumed in a greater context that implies a certain endianness, use
  201. // IgnoreBOM. Otherwise, use ExpectBOM and always produce and consume a BOM.
  202. //
  203. // In the language of https://www.unicode.org/faq/utf_bom.html#bom10, IgnoreBOM
  204. // corresponds to "Where the precise type of the data stream is known... the
  205. // BOM should not be used" and ExpectBOM corresponds to "A particular
  206. // protocol... may require use of the BOM".
  207. func UTF16(e Endianness, b BOMPolicy) encoding.Encoding {
  208. return utf16Encoding{config{e, b}, mibValue[e][b&bomMask]}
  209. }
  210. // mibValue maps Endianness and BOMPolicy settings to MIB constants. Note that
  211. // some configurations map to the same MIB identifier. RFC 2781 has requirements
  212. // and recommendations. Some of the "configurations" are merely recommendations,
  213. // so multiple configurations could match.
  214. var mibValue = map[Endianness][numBOMValues]identifier.MIB{
  215. BigEndian: [numBOMValues]identifier.MIB{
  216. IgnoreBOM: identifier.UTF16BE,
  217. UseBOM: identifier.UTF16, // BigEnding default is preferred by RFC 2781.
  218. // TODO: acceptBOM | strictBOM would map to UTF16BE as well.
  219. },
  220. LittleEndian: [numBOMValues]identifier.MIB{
  221. IgnoreBOM: identifier.UTF16LE,
  222. UseBOM: identifier.UTF16, // LittleEndian default is allowed and preferred on Windows.
  223. // TODO: acceptBOM | strictBOM would map to UTF16LE as well.
  224. },
  225. // ExpectBOM is not widely used and has no valid MIB identifier.
  226. }
  227. // All lists a configuration for each IANA-defined UTF-16 variant.
  228. var All = []encoding.Encoding{
  229. UTF8,
  230. UTF16(BigEndian, UseBOM),
  231. UTF16(BigEndian, IgnoreBOM),
  232. UTF16(LittleEndian, IgnoreBOM),
  233. }
  234. // BOMPolicy is a UTF-16 encoding's byte order mark policy.
  235. type BOMPolicy uint8
  236. const (
  237. writeBOM BOMPolicy = 0x01
  238. acceptBOM BOMPolicy = 0x02
  239. requireBOM BOMPolicy = 0x04
  240. bomMask BOMPolicy = 0x07
  241. // HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a
  242. // map of an array of length 8 of a type that is also used as a key or value
  243. // in another map). See golang.org/issue/11354.
  244. // TODO: consider changing this value back to 8 if the use of 1.4.* has
  245. // been minimized.
  246. numBOMValues = 8 + 1
  247. // IgnoreBOM means to ignore any byte order marks.
  248. IgnoreBOM BOMPolicy = 0
  249. // Common and RFC 2781-compliant interpretation for UTF-16BE/LE.
  250. // UseBOM means that the UTF-16 form may start with a byte order mark, which
  251. // will be used to override the default encoding.
  252. UseBOM BOMPolicy = writeBOM | acceptBOM
  253. // Common and RFC 2781-compliant interpretation for UTF-16.
  254. // ExpectBOM means that the UTF-16 form must start with a byte order mark,
  255. // which will be used to override the default encoding.
  256. ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM
  257. // Used in Java as Unicode (not to be confused with Java's UTF-16) and
  258. // ICU's UTF-16,version=1. Not compliant with RFC 2781.
  259. // TODO (maybe): strictBOM: BOM must match Endianness. This would allow:
  260. // - UTF-16(B|L)E,version=1: writeBOM | acceptBOM | requireBOM | strictBOM
  261. // (UnicodeBig and UnicodeLittle in Java)
  262. // - RFC 2781-compliant, but less common interpretation for UTF-16(B|L)E:
  263. // acceptBOM | strictBOM (e.g. assigned to CheckBOM).
  264. // This addition would be consistent with supporting ExpectBOM.
  265. )
  266. // Endianness is a UTF-16 encoding's default endianness.
  267. type Endianness bool
  268. const (
  269. // BigEndian is UTF-16BE.
  270. BigEndian Endianness = false
  271. // LittleEndian is UTF-16LE.
  272. LittleEndian Endianness = true
  273. )
  274. // ErrMissingBOM means that decoding UTF-16 input with ExpectBOM did not find a
  275. // starting byte order mark.
  276. var ErrMissingBOM = errors.New("encoding: missing byte order mark")
  277. type utf16Encoding struct {
  278. config
  279. mib identifier.MIB
  280. }
  281. type config struct {
  282. endianness Endianness
  283. bomPolicy BOMPolicy
  284. }
  285. func (u utf16Encoding) NewDecoder() *encoding.Decoder {
  286. return &encoding.Decoder{Transformer: &utf16Decoder{
  287. initial: u.config,
  288. current: u.config,
  289. }}
  290. }
  291. func (u utf16Encoding) NewEncoder() *encoding.Encoder {
  292. return &encoding.Encoder{Transformer: &utf16Encoder{
  293. endianness: u.endianness,
  294. initialBOMPolicy: u.bomPolicy,
  295. currentBOMPolicy: u.bomPolicy,
  296. }}
  297. }
  298. func (u utf16Encoding) ID() (mib identifier.MIB, other string) {
  299. return u.mib, ""
  300. }
  301. func (u utf16Encoding) String() string {
  302. e, b := "B", ""
  303. if u.endianness == LittleEndian {
  304. e = "L"
  305. }
  306. switch u.bomPolicy {
  307. case ExpectBOM:
  308. b = "Expect"
  309. case UseBOM:
  310. b = "Use"
  311. case IgnoreBOM:
  312. b = "Ignore"
  313. }
  314. return "UTF-16" + e + "E (" + b + " BOM)"
  315. }
  316. type utf16Decoder struct {
  317. initial config
  318. current config
  319. }
  320. func (u *utf16Decoder) Reset() {
  321. u.current = u.initial
  322. }
  323. func (u *utf16Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  324. if len(src) < 2 && atEOF && u.current.bomPolicy&requireBOM != 0 {
  325. return 0, 0, ErrMissingBOM
  326. }
  327. if len(src) == 0 {
  328. return 0, 0, nil
  329. }
  330. if len(src) >= 2 && u.current.bomPolicy&acceptBOM != 0 {
  331. switch {
  332. case src[0] == 0xfe && src[1] == 0xff:
  333. u.current.endianness = BigEndian
  334. nSrc = 2
  335. case src[0] == 0xff && src[1] == 0xfe:
  336. u.current.endianness = LittleEndian
  337. nSrc = 2
  338. default:
  339. if u.current.bomPolicy&requireBOM != 0 {
  340. return 0, 0, ErrMissingBOM
  341. }
  342. }
  343. u.current.bomPolicy = IgnoreBOM
  344. }
  345. var r rune
  346. var dSize, sSize int
  347. for nSrc < len(src) {
  348. if nSrc+1 < len(src) {
  349. x := uint16(src[nSrc+0])<<8 | uint16(src[nSrc+1])
  350. if u.current.endianness == LittleEndian {
  351. x = x>>8 | x<<8
  352. }
  353. r, sSize = rune(x), 2
  354. if utf16.IsSurrogate(r) {
  355. if nSrc+3 < len(src) {
  356. x = uint16(src[nSrc+2])<<8 | uint16(src[nSrc+3])
  357. if u.current.endianness == LittleEndian {
  358. x = x>>8 | x<<8
  359. }
  360. // Save for next iteration if it is not a high surrogate.
  361. if isHighSurrogate(rune(x)) {
  362. r, sSize = utf16.DecodeRune(r, rune(x)), 4
  363. }
  364. } else if !atEOF {
  365. err = transform.ErrShortSrc
  366. break
  367. }
  368. }
  369. if dSize = utf8.RuneLen(r); dSize < 0 {
  370. r, dSize = utf8.RuneError, 3
  371. }
  372. } else if atEOF {
  373. // Single trailing byte.
  374. r, dSize, sSize = utf8.RuneError, 3, 1
  375. } else {
  376. err = transform.ErrShortSrc
  377. break
  378. }
  379. if nDst+dSize > len(dst) {
  380. err = transform.ErrShortDst
  381. break
  382. }
  383. nDst += utf8.EncodeRune(dst[nDst:], r)
  384. nSrc += sSize
  385. }
  386. return nDst, nSrc, err
  387. }
  388. func isHighSurrogate(r rune) bool {
  389. return 0xDC00 <= r && r <= 0xDFFF
  390. }
  391. type utf16Encoder struct {
  392. endianness Endianness
  393. initialBOMPolicy BOMPolicy
  394. currentBOMPolicy BOMPolicy
  395. }
  396. func (u *utf16Encoder) Reset() {
  397. u.currentBOMPolicy = u.initialBOMPolicy
  398. }
  399. func (u *utf16Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  400. if u.currentBOMPolicy&writeBOM != 0 {
  401. if len(dst) < 2 {
  402. return 0, 0, transform.ErrShortDst
  403. }
  404. dst[0], dst[1] = 0xfe, 0xff
  405. u.currentBOMPolicy = IgnoreBOM
  406. nDst = 2
  407. }
  408. r, size := rune(0), 0
  409. for nSrc < len(src) {
  410. r = rune(src[nSrc])
  411. // Decode a 1-byte rune.
  412. if r < utf8.RuneSelf {
  413. size = 1
  414. } else {
  415. // Decode a multi-byte rune.
  416. r, size = utf8.DecodeRune(src[nSrc:])
  417. if size == 1 {
  418. // All valid runes of size 1 (those below utf8.RuneSelf) were
  419. // handled above. We have invalid UTF-8 or we haven't seen the
  420. // full character yet.
  421. if !atEOF && !utf8.FullRune(src[nSrc:]) {
  422. err = transform.ErrShortSrc
  423. break
  424. }
  425. }
  426. }
  427. if r <= 0xffff {
  428. if nDst+2 > len(dst) {
  429. err = transform.ErrShortDst
  430. break
  431. }
  432. dst[nDst+0] = uint8(r >> 8)
  433. dst[nDst+1] = uint8(r)
  434. nDst += 2
  435. } else {
  436. if nDst+4 > len(dst) {
  437. err = transform.ErrShortDst
  438. break
  439. }
  440. r1, r2 := utf16.EncodeRune(r)
  441. dst[nDst+0] = uint8(r1 >> 8)
  442. dst[nDst+1] = uint8(r1)
  443. dst[nDst+2] = uint8(r2 >> 8)
  444. dst[nDst+3] = uint8(r2)
  445. nDst += 4
  446. }
  447. nSrc += size
  448. }
  449. if u.endianness == LittleEndian {
  450. for i := 0; i < nDst; i += 2 {
  451. dst[i], dst[i+1] = dst[i+1], dst[i]
  452. }
  453. }
  454. return nDst, nSrc, err
  455. }