1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
13 use tables
::grapheme
::GraphemeCat
;
15 /// External iterator for grapheme clusters and byte offsets.
17 pub struct GraphemeIndices
<'a
> {
// Forward iteration for GraphemeIndices: wraps the inner iterator held in self.iter
// and pairs every slice it yields with a usize offset.
22 impl<'a
> Iterator
for GraphemeIndices
<'a
> {
// Each item is an (offset, slice) pair.
23 type Item
= (usize, &'a
str);
/// Takes the next slice from self.iter and returns it together with its offset,
/// computed as the slice start address minus self.start_offset.
26 fn next(&mut self) -> Option
<(usize, &'a
str)> {
27 self.iter
.next().map(|s
| (s
.as_ptr() as usize - self.start_offset
, s
))
31 fn size_hint(&self) -> (usize, Option
<usize>) {
// Backward iteration for GraphemeIndices: same (offset, slice) pairing as the
// forward direction, but pulling from the back of the inner iterator.
36 impl<'a
> DoubleEndedIterator
for GraphemeIndices
<'a
> {
/// Takes the next slice from the back of self.iter and returns it together with
/// its offset, computed as the slice start address minus self.start_offset.
38 fn next_back(&mut self) -> Option
<(usize, &'a
str)> {
39 self.iter
.next_back().map(|s
| (s
.as_ptr() as usize - self.start_offset
, s
))
43 /// External iterator for a string's
44 /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
46 pub struct Graphemes
<'a
> {
// Category cache for the forward direction: next() consumes it with take()
// instead of calling grapheme_category again, and assigns it anew before returning.
49 cat
: Option
<GraphemeCat
>,
// Category cache for the backward direction: next_back() consumes it with take()
// instead of calling grapheme_category again, and assigns it anew before returning.
50 catb
: Option
<GraphemeCat
>,
53 // state machine for cluster boundary rules
54 #[derive(PartialEq,Eq)]
// Forward iteration over grapheme clusters: each call to next() slices one cluster
// off the front of self.string.
64 impl<'a
> Iterator
for Graphemes
<'a
> {
/// Bounds the number of remaining clusters from the byte length of the remaining
/// string: the lower bound is min(len, 1) and the upper bound is len.
68 fn size_hint(&self) -> (usize, Option
<usize>) {
69 let slen
= self.string
.len();
70 (cmp
::min(slen
, 1), Some(slen
))
/// Walks the characters of self.string from the front, driving the cluster
/// boundary state machine, then splits the string at the resulting byte index:
/// the front part becomes retstr and self.string keeps the remainder.
74 fn next(&mut self) -> Option
<&'a
str> {
75 use self::GraphemeState
::*;
76 use tables
::grapheme
as gr
;
77 if self.string
.len() == 0 {
// take_curr selects what gets written back into self.cat once the loop is done.
81 let mut take_curr
= true;
83 let mut state
= Start
;
84 let mut cat
= gr
::GC_Any
;
85 for (curr
, ch
) in self.string
.char_indices() {
88 // retrieve cached category, if any
89 // We do this because most of the time we would end up
90 // looking up each character twice.
91 cat
= match self.cat
{
92 None
=> gr
::grapheme_category(ch
),
93 _
=> self.cat
.take().unwrap()
97 gr
::GC_Extend
=> true,
98 gr
::GC_SpacingMark
if self.extended
=> true,
101 state
= FindExtend
; // rule GB9/GB9a
// Advance the state machine: the next state depends on the current state and,
// in most arms, on the category of the current character.
105 state
= match state
{
106 Start
if '
\r'
== ch
=> {
107 let slen
= self.string
.len();
109 if nidx
!= slen
&& self.string
[nidx
..].chars().next().unwrap() == '
\n'
{
110 idx
= nidx
; // rule GB3
115 gr
::GC_Control
=> break,
117 gr
::GC_LV
| gr
::GC_V
=> HangulLV
,
118 gr
::GC_LVT
| gr
::GC_T
=> HangulLVT
,
119 gr
::GC_Regional_Indicator
=> Regional
,
122 FindExtend
=> { // found non-extending when looking for extending
126 HangulL
=> match cat
{ // rule GB6: L x (L|V|LV|LVT)
127 gr
::GC_L
=> continue,
128 gr
::GC_LV
| gr
::GC_V
=> HangulLV
,
129 gr
::GC_LVT
=> HangulLVT
,
135 HangulLV
=> match cat
{ // rule GB7: (LV|V) x (V|T)
136 gr
::GC_V
=> continue,
137 gr
::GC_T
=> HangulLVT
,
143 HangulLVT
=> match cat
{ // rule GB8: (LVT|T) x T
144 gr
::GC_T
=> continue,
150 Regional
=> match cat
{ // rule GB8a
151 gr
::GC_Regional_Indicator
=> continue,
// After the scan: when take_curr is set, idx is advanced past the character
// that starts at idx (by its UTF-8 length).
160 self.cat
= if take_curr
{
161 idx
= idx
+ self.string
[idx
..].chars().next().unwrap().len_utf8();
// Split at idx: retstr is the front part and self.string keeps the remainder.
167 let retstr
= &self.string
[..idx
];
168 self.string
= &self.string
[idx
..];
// Backward iteration over grapheme clusters: each call to next_back() slices one
// cluster off the end of self.string.
173 impl<'a
> DoubleEndedIterator
for Graphemes
<'a
> {
/// Walks the characters of self.string from the end (char_indices().rev()),
/// driving a backwards version of the boundary state machine, then splits the
/// string at the resulting byte index: the tail becomes retstr and self.string
/// keeps the front part.
175 fn next_back(&mut self) -> Option
<&'a
str> {
176 use self::GraphemeState
::*;
177 use tables
::grapheme
as gr
;
178 if self.string
.len() == 0 {
// take_curr selects what gets written back into self.catb once the loop is done.
182 let mut take_curr
= true;
// idx starts at the byte length of the string, i.e. just past its last character.
183 let mut idx
= self.string
.len();
184 let mut previdx
= idx
;
185 let mut state
= Start
;
186 let mut cat
= gr
::GC_Any
;
187 for (curr
, ch
) in self.string
.char_indices().rev() {
191 // cached category, if any
192 cat
= match self.catb
{
193 None
=> gr
::grapheme_category(ch
),
194 _
=> self.catb
.take().unwrap()
197 // a matching state machine that runs *backwards* across an input string
198 // note that this has some implications for the Hangul matching, since
199 // we now need to know what the rightward letter is:
201 // Right to left, we have:
205 // HangulL means the letter to the right is L
206 // HangulLV means the letter to the right is V
207 // HangulLVT means the letter to the right is T
208 state
= match state
{
209 Start
if '
\n'
== ch
=> {
210 if idx
> 0 && '
\r'
== self.string
[..idx
].chars().next_back().unwrap() {
211 idx
-= 1; // rule GB3
215 Start
| FindExtend
=> match cat
{
216 gr
::GC_Extend
=> FindExtend
,
217 gr
::GC_SpacingMark
if self.extended
=> FindExtend
,
218 gr
::GC_L
| gr
::GC_LV
| gr
::GC_LVT
=> HangulL
,
219 gr
::GC_V
=> HangulLV
,
220 gr
::GC_T
=> HangulLVT
,
221 gr
::GC_Regional_Indicator
=> Regional
,
223 take_curr
= Start
== state
;
228 HangulL
=> match cat
{ // char to right is an L
229 gr
::GC_L
=> continue, // L x L is the only legal match
235 HangulLV
=> match cat
{ // char to right is a V
236 gr
::GC_V
=> continue, // V x V, right char is still V
237 gr
::GC_L
| gr
::GC_LV
=> HangulL
, // (L|LV) x V, right char is now L
243 HangulLVT
=> match cat
{ // char to right is a T
244 gr
::GC_T
=> continue, // T x T, right char is still T
245 gr
::GC_V
=> HangulLV
, // V x T, right char is now V
246 gr
::GC_LV
| gr
::GC_LVT
=> HangulL
, // (LV|LVT) x T, right char is now L
252 Regional
=> match cat
{ // rule GB8a
253 gr
::GC_Regional_Indicator
=> continue,
// After the scan, take_curr selects what is written back into the catb cache.
262 self.catb
= if take_curr
{
// Split at idx: retstr is the tail and self.string keeps the front part.
269 let retstr
= &self.string
[idx
..];
270 self.string
= &self.string
[..idx
];
/// Builds a `Graphemes` iterator over `s`.
///
/// `is_extended` is stored in the `extended` field, which the iterator checks when it
/// meets a `GC_SpacingMark` character. Both category caches (`cat`, `catb`) start
/// out as `None`.
276 pub fn new_graphemes
<'b
>(s
: &'b
str, is_extended
: bool
) -> Graphemes
<'b
> {
277 Graphemes { string: s, extended: is_extended, cat: None, catb: None }
/// Builds a `GraphemeIndices` iterator over `s`.
///
/// Records the address of the start of `s` as `start_offset` (the iterator subtracts
/// it from the address of every slice it yields to obtain that slice offset) and
/// wraps the iterator returned by `new_graphemes(s, is_extended)`.
281 pub fn new_grapheme_indices
<'b
>(s
: &'b
str, is_extended
: bool
) -> GraphemeIndices
<'b
> {
282 GraphemeIndices { start_offset: s.as_ptr() as usize, iter: new_graphemes(s, is_extended) }