grapheme.go 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
  1. package uniseg
  2. // The states of the grapheme cluster parser.
  3. const (
  4. grAny = iota
  5. grCR
  6. grControlLF
  7. grL
  8. grLVV
  9. grLVTT
  10. grPrepend
  11. grExtendedPictographic
  12. grExtendedPictographicZWJ
  13. grRIOdd
  14. grRIEven
  15. )
  16. // The grapheme cluster parser's breaking instructions.
  17. const (
  18. grNoBoundary = iota
  19. grBoundary
  20. )
  21. // The grapheme cluster parser's state transitions. Maps (state, property) to
  22. // (new state, breaking instruction, rule number). The breaking instruction
  23. // always refers to the boundary between the last and next code point.
  24. //
  25. // This map is queried as follows:
  26. //
  27. // 1. Find specific state + specific property. Stop if found.
  28. // 2. Find specific state + any property.
  29. // 3. Find any state + specific property.
  30. // 4. If only (2) or (3) (but not both) was found, stop.
  31. // 5. If both (2) and (3) were found, use state and breaking instruction from
  32. // the transition with the lower rule number, prefer (3) if rule numbers
  33. // are equal. Stop.
  34. // 6. Assume grAny and grBoundary.
  35. var grTransitions = map[[2]int][3]int{
  36. // GB5
  37. {grAny, prCR}: {grCR, grBoundary, 50},
  38. {grAny, prLF}: {grControlLF, grBoundary, 50},
  39. {grAny, prControl}: {grControlLF, grBoundary, 50},
  40. // GB4
  41. {grCR, prAny}: {grAny, grBoundary, 40},
  42. {grControlLF, prAny}: {grAny, grBoundary, 40},
  43. // GB3.
  44. {grCR, prLF}: {grAny, grNoBoundary, 30},
  45. // GB6.
  46. {grAny, prL}: {grL, grBoundary, 9990},
  47. {grL, prL}: {grL, grNoBoundary, 60},
  48. {grL, prV}: {grLVV, grNoBoundary, 60},
  49. {grL, prLV}: {grLVV, grNoBoundary, 60},
  50. {grL, prLVT}: {grLVTT, grNoBoundary, 60},
  51. // GB7.
  52. {grAny, prLV}: {grLVV, grBoundary, 9990},
  53. {grAny, prV}: {grLVV, grBoundary, 9990},
  54. {grLVV, prV}: {grLVV, grNoBoundary, 70},
  55. {grLVV, prT}: {grLVTT, grNoBoundary, 70},
  56. // GB8.
  57. {grAny, prLVT}: {grLVTT, grBoundary, 9990},
  58. {grAny, prT}: {grLVTT, grBoundary, 9990},
  59. {grLVTT, prT}: {grLVTT, grNoBoundary, 80},
  60. // GB9.
  61. {grAny, prExtend}: {grAny, grNoBoundary, 90},
  62. {grAny, prZWJ}: {grAny, grNoBoundary, 90},
  63. // GB9a.
  64. {grAny, prSpacingMark}: {grAny, grNoBoundary, 91},
  65. // GB9b.
  66. {grAny, prPreprend}: {grPrepend, grBoundary, 9990},
  67. {grPrepend, prAny}: {grAny, grNoBoundary, 92},
  68. // GB11.
  69. {grAny, prExtendedPictographic}: {grExtendedPictographic, grBoundary, 9990},
  70. {grExtendedPictographic, prExtend}: {grExtendedPictographic, grNoBoundary, 110},
  71. {grExtendedPictographic, prZWJ}: {grExtendedPictographicZWJ, grNoBoundary, 110},
  72. {grExtendedPictographicZWJ, prExtendedPictographic}: {grExtendedPictographic, grNoBoundary, 110},
  73. // GB12 / GB13.
  74. {grAny, prRegionalIndicator}: {grRIOdd, grBoundary, 9990},
  75. {grRIOdd, prRegionalIndicator}: {grRIEven, grNoBoundary, 120},
  76. {grRIEven, prRegionalIndicator}: {grRIOdd, grBoundary, 120},
  77. }
  78. // Graphemes implements an iterator over Unicode extended grapheme clusters,
  79. // specified in the Unicode Standard Annex #29. Grapheme clusters correspond to
  80. // "user-perceived characters". These characters often consist of multiple
  81. // code points (e.g. the "woman kissing woman" emoji consists of 8 code points:
  82. // woman + ZWJ + heavy black heart (2 code points) + ZWJ + kiss mark + ZWJ +
  83. // woman) and the rules described in Annex #29 must be applied to group those
  84. // code points into clusters perceived by the user as one character.
  85. type Graphemes struct {
  86. // The code points over which this class iterates.
  87. codePoints []rune
  88. // The (byte-based) indices of the code points into the original string plus
  89. // len(original string). Thus, len(indices) = len(codePoints) + 1.
  90. indices []int
  91. // The current grapheme cluster to be returned. These are indices into
  92. // codePoints/indices. If start == end, we either haven't started iterating
  93. // yet (0) or the iteration has already completed (1).
  94. start, end int
  95. // The index of the next code point to be parsed.
  96. pos int
  97. // The current state of the code point parser.
  98. state int
  99. }
  100. // NewGraphemes returns a new grapheme cluster iterator.
  101. func NewGraphemes(s string) *Graphemes {
  102. g := &Graphemes{}
  103. for index, codePoint := range s {
  104. g.codePoints = append(g.codePoints, codePoint)
  105. g.indices = append(g.indices, index)
  106. }
  107. g.indices = append(g.indices, len(s))
  108. g.Next() // Parse ahead.
  109. return g
  110. }
  111. // Next advances the iterator by one grapheme cluster and returns false if no
  112. // clusters are left. This function must be called before the first cluster is
  113. // accessed.
  114. func (g *Graphemes) Next() bool {
  115. g.start = g.end
  116. // The state transition gives us a boundary instruction BEFORE the next code
  117. // point so we always need to stay ahead by one code point.
  118. // Parse the next code point.
  119. for g.pos <= len(g.codePoints) {
  120. // GB2.
  121. if g.pos == len(g.codePoints) {
  122. g.end = g.pos
  123. g.pos++
  124. break
  125. }
  126. // Determine the property of the next character.
  127. nextProperty := property(g.codePoints[g.pos])
  128. g.pos++
  129. // Find the applicable transition.
  130. var boundary bool
  131. transition, ok := grTransitions[[2]int{g.state, nextProperty}]
  132. if ok {
  133. // We have a specific transition. We'll use it.
  134. g.state = transition[0]
  135. boundary = transition[1] == grBoundary
  136. } else {
  137. // No specific transition found. Try the less specific ones.
  138. transAnyProp, okAnyProp := grTransitions[[2]int{g.state, prAny}]
  139. transAnyState, okAnyState := grTransitions[[2]int{grAny, nextProperty}]
  140. if okAnyProp && okAnyState {
  141. // Both apply. We'll use a mix (see comments for grTransitions).
  142. g.state = transAnyState[0]
  143. boundary = transAnyState[1] == grBoundary
  144. if transAnyProp[2] < transAnyState[2] {
  145. g.state = transAnyProp[0]
  146. boundary = transAnyProp[1] == grBoundary
  147. }
  148. } else if okAnyProp {
  149. // We only have a specific state.
  150. g.state = transAnyProp[0]
  151. boundary = transAnyProp[1] == grBoundary
  152. // This branch will probably never be reached because okAnyState will
  153. // always be true given the current transition map. But we keep it here
  154. // for future modifications to the transition map where this may not be
  155. // true anymore.
  156. } else if okAnyState {
  157. // We only have a specific property.
  158. g.state = transAnyState[0]
  159. boundary = transAnyState[1] == grBoundary
  160. } else {
  161. // No known transition. GB999: Any x Any.
  162. g.state = grAny
  163. boundary = true
  164. }
  165. }
  166. // If we found a cluster boundary, let's stop here. The current cluster will
  167. // be the one that just ended.
  168. if g.pos-1 == 0 /* GB1 */ || boundary {
  169. g.end = g.pos - 1
  170. break
  171. }
  172. }
  173. return g.start != g.end
  174. }
  175. // Runes returns a slice of runes (code points) which corresponds to the current
  176. // grapheme cluster. If the iterator is already past the end or Next() has not
  177. // yet been called, nil is returned.
  178. func (g *Graphemes) Runes() []rune {
  179. if g.start == g.end {
  180. return nil
  181. }
  182. return g.codePoints[g.start:g.end]
  183. }
  184. // Str returns a substring of the original string which corresponds to the
  185. // current grapheme cluster. If the iterator is already past the end or Next()
  186. // has not yet been called, an empty string is returned.
  187. func (g *Graphemes) Str() string {
  188. if g.start == g.end {
  189. return ""
  190. }
  191. return string(g.codePoints[g.start:g.end])
  192. }
  193. // Bytes returns a byte slice which corresponds to the current grapheme cluster.
  194. // If the iterator is already past the end or Next() has not yet been called,
  195. // nil is returned.
  196. func (g *Graphemes) Bytes() []byte {
  197. if g.start == g.end {
  198. return nil
  199. }
  200. return []byte(string(g.codePoints[g.start:g.end]))
  201. }
  202. // Positions returns the interval of the current grapheme cluster as byte
  203. // positions into the original string. The first returned value "from" indexes
  204. // the first byte and the second returned value "to" indexes the first byte that
  205. // is not included anymore, i.e. str[from:to] is the current grapheme cluster of
  206. // the original string "str". If Next() has not yet been called, both values are
  207. // 0. If the iterator is already past the end, both values are 1.
  208. func (g *Graphemes) Positions() (int, int) {
  209. return g.indices[g.start], g.indices[g.end]
  210. }
  211. // Reset puts the iterator into its initial state such that the next call to
  212. // Next() sets it to the first grapheme cluster again.
  213. func (g *Graphemes) Reset() {
  214. g.start, g.end, g.pos, g.state = 0, 0, 0, grAny
  215. g.Next() // Parse ahead again.
  216. }
  217. // GraphemeClusterCount returns the number of user-perceived characters
  218. // (grapheme clusters) for the given string. To calculate this number, it
  219. // iterates through the string using the Graphemes iterator.
  220. func GraphemeClusterCount(s string) (n int) {
  221. g := NewGraphemes(s)
  222. for g.Next() {
  223. n++
  224. }
  225. return
  226. }