utf8.go 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. package wsutil
  2. import (
  3. "fmt"
  4. "io"
  5. )
  6. // ErrInvalidUTF8 is returned by UTF8 reader on invalid utf8 sequence.
  7. var ErrInvalidUTF8 = fmt.Errorf("invalid utf8")
  8. // UTF8Reader implements io.Reader that calculates utf8 validity state after
  9. // every read byte from Source.
  10. //
  11. // Note that in some cases client must call r.Valid() after all bytes are read
  12. // to ensure that all of them are valid utf8 sequences. That is, some io helper
  13. // functions such io.ReadAtLeast or io.ReadFull could discard the error
  14. // information returned by the reader when they receive all of requested bytes.
  15. // For example, the last read sequence is invalid and UTF8Reader returns number
  16. // of bytes read and an error. But helper function decides to discard received
  17. // error due to all requested bytes are completely read from the source.
  18. //
  19. // Another possible case is when some valid sequence become split by the read
  20. // bound. Then UTF8Reader can not make decision about validity of the last
  21. // sequence cause it is not fully read yet. And if the read stops, Valid() will
  22. // return false, even if Read() by itself dit not.
  23. type UTF8Reader struct {
  24. Source io.Reader
  25. accepted int
  26. state uint32
  27. codep uint32
  28. }
  29. // NewUTF8Reader creates utf8 reader that reads from r.
  30. func NewUTF8Reader(r io.Reader) *UTF8Reader {
  31. return &UTF8Reader{
  32. Source: r,
  33. }
  34. }
  35. // Reset resets utf8 reader to read from r.
  36. func (u *UTF8Reader) Reset(r io.Reader) {
  37. u.Source = r
  38. u.state = 0
  39. u.codep = 0
  40. }
  41. // Read implements io.Reader.
  42. func (u *UTF8Reader) Read(p []byte) (n int, err error) {
  43. n, err = u.Source.Read(p)
  44. accepted := 0
  45. s, c := u.state, u.codep
  46. for i := 0; i < n; i++ {
  47. c, s = decode(s, c, p[i])
  48. if s == utf8Reject {
  49. u.state = s
  50. return accepted, ErrInvalidUTF8
  51. }
  52. if s == utf8Accept {
  53. accepted = i + 1
  54. }
  55. }
  56. u.state, u.codep = s, c
  57. u.accepted = accepted
  58. return n, err
  59. }
  60. // Valid checks current reader state. It returns true if all read bytes are
  61. // valid UTF-8 sequences, and false if not.
  62. func (u *UTF8Reader) Valid() bool {
  63. return u.state == utf8Accept
  64. }
  65. // Accepted returns number of valid bytes in last Read().
  66. func (u *UTF8Reader) Accepted() int {
  67. return u.accepted
  68. }
  69. // Below is port of UTF-8 decoder from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
  70. //
  71. // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
  72. //
  73. // Permission is hereby granted, free of charge, to any person obtaining a copy
  74. // of this software and associated documentation files (the "Software"), to
  75. // deal in the Software without restriction, including without limitation the
  76. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  77. // sell copies of the Software, and to permit persons to whom the Software is
  78. // furnished to do so, subject to the following conditions:
  79. //
  80. // The above copyright notice and this permission notice shall be included in
  81. // all copies or substantial portions of the Software.
  82. //
  83. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  84. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  85. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  86. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  87. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  88. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  89. // IN THE SOFTWARE.
// Decoder states that the rest of the file tests for. A state value is used
// by decode as an offset into the transition part of utf8d.
const (
	// utf8Accept is the state in which no sequence is pending: decode
	// treats the next byte as the start of a new code point, and Valid
	// reports true only in this state.
	utf8Accept = 0
	// utf8Reject is the state that makes Read stop and return
	// ErrInvalidUTF8.
	utf8Reject = 12
)
// utf8d holds both lookup tables used by decode in one array: entries 0-255
// are indexed by the input byte, and the entries from 256 onwards are indexed
// by 256 + state + character class.
var utf8d = [...]byte{
	// The first part of the table maps bytes to character classes
	// to reduce the size of the transition table and create bitmasks.
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
	7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
	// The second part is a transition table that maps a combination
	// of a state of the automaton and a character class to a state.
	0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
	12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12,
	12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
	12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
	12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
}
  113. func decode(state, codep uint32, b byte) (uint32, uint32) {
  114. t := uint32(utf8d[b])
  115. if state != utf8Accept {
  116. codep = (uint32(b) & 0x3f) | (codep << 6)
  117. } else {
  118. codep = (0xff >> t) & uint32(b)
  119. }
  120. return codep, uint32(utf8d[256+state+t])
  121. }