Unicode library for C++ by Ross Smith
`#include "unicorn/segment.hpp"`

This module contains classes and functions for breaking text up into characters, words, sentences, lines, and paragraphs. Most of the rules followed here are defined in Unicode Standard Annex 29: Unicode Text Segmentation.
All of the iterators defined here dereference to a substring represented by a
pair of UTF iterators, bracketing the text segment of
interest. As usual, the u_str() function can be used to copy the actual
substring if this is needed.
* `template <typename C> class GraphemeIterator`
    * `using GraphemeIterator::utf_iterator = UtfIterator<C>`
    * `using GraphemeIterator::difference_type = ptrdiff_t`
    * `using GraphemeIterator::iterator_category = std::forward_iterator_tag`
    * `using GraphemeIterator::value_type = Irange<utf_iterator>`
    * `using GraphemeIterator::pointer = const value_type*`
    * `using GraphemeIterator::reference = const value_type&`
    * `GraphemeIterator::GraphemeIterator()`
* `template <typename C> Irange<GraphemeIterator<C>> grapheme_range(const UtfIterator<C>& i, const UtfIterator<C>& j)`
* `template <typename C> Irange<GraphemeIterator<C>> grapheme_range(const Irange<UtfIterator<C>>& source)`
* `template <typename C> Irange<GraphemeIterator<C>> grapheme_range(const basic_string<C>& source)`

A forward iterator over the grapheme clusters (user-perceived characters) in a Unicode string.
* `template <typename C> class WordIterator`
    * `using WordIterator::utf_iterator = UtfIterator<C>`
    * `using WordIterator::difference_type = ptrdiff_t`
    * `using WordIterator::iterator_category = std::forward_iterator_tag`
    * `using WordIterator::value_type = Irange<utf_iterator>`
    * `using WordIterator::pointer = const value_type*`
    * `using WordIterator::reference = const value_type&`
    * `WordIterator::WordIterator()`
* `template <typename C> Irange<WordIterator<C>> word_range(const UtfIterator<C>& i, const UtfIterator<C>& j, uint32_t flags = 0)`
* `template <typename C> Irange<WordIterator<C>> word_range(const Irange<UtfIterator<C>>& source, uint32_t flags = 0)`
* `template <typename C> Irange<WordIterator<C>> word_range(const basic_string<C>& source, uint32_t flags = 0)`

A forward iterator over the words in a Unicode string. By default, all segments identified as "words" by the UAX29 algorithm are returned; this will include whitespace between words, punctuation marks, etc. Flags can be used to select only words containing at least one non-whitespace character, or only words containing at least one alphanumeric character.
| Flag | Description |
|---|---|
| `unicode_words` | Report all UAX29 words (default) |
| `graphic_words` | Report only words containing a non-whitespace character |
| `alpha_words` | Report only words containing an alphanumeric character |
* `template <typename C> class SentenceIterator`
    * `using SentenceIterator::utf_iterator = UtfIterator<C>`
    * `using SentenceIterator::difference_type = ptrdiff_t`
    * `using SentenceIterator::iterator_category = std::forward_iterator_tag`
    * `using SentenceIterator::value_type = Irange<utf_iterator>`
    * `using SentenceIterator::pointer = const value_type*`
    * `using SentenceIterator::reference = const value_type&`
    * `SentenceIterator::SentenceIterator()`
* `template <typename C> Irange<SentenceIterator<C>> sentence_range(const UtfIterator<C>& i, const UtfIterator<C>& j)`
* `template <typename C> Irange<SentenceIterator<C>> sentence_range(const Irange<UtfIterator<C>>& source)`
* `template <typename C> Irange<SentenceIterator<C>> sentence_range(const basic_string<C>& source)`

A forward iterator over the sentences in a Unicode string (as defined by UAX29).
* `template <typename C> class LineIterator`
    * `using LineIterator::utf_iterator = UtfIterator<C>`
    * `using LineIterator::difference_type = ptrdiff_t`
    * `using LineIterator::iterator_category = std::forward_iterator_tag`
    * `using LineIterator::value_type = Irange<utf_iterator>`
    * `using LineIterator::pointer = const value_type*`
    * `using LineIterator::reference = const value_type&`
    * `LineIterator::LineIterator()`
* `template <typename C> Irange<LineIterator<C>> line_range(const UtfIterator<C>& i, const UtfIterator<C>& j, uint32_t flags = 0)`
* `template <typename C> Irange<LineIterator<C>> line_range(const Irange<UtfIterator<C>>& source, uint32_t flags = 0)`
* `template <typename C> Irange<LineIterator<C>> line_range(const basic_string<C>& source, uint32_t flags = 0)`

A forward iterator over the lines in a Unicode string. Lines are ended by any
character with the line break property. Multiple consecutive line break
characters are treated as separate lines, except that CR+LF is treated as a
single line break. By default, the segment identified by the dereferenced
iterator includes the terminating line break; if the `strip_breaks` flag is
set, the line break is excluded from the segment.
| Flag | Description |
|---|---|
| `keep_breaks` | Include line terminators in reported segments (default) |
| `strip_breaks` | Do not include line terminators |
* `template <typename C> class ParagraphIterator`
    * `using ParagraphIterator::utf_iterator = UtfIterator<C>`
    * `using ParagraphIterator::difference_type = ptrdiff_t`
    * `using ParagraphIterator::iterator_category = std::forward_iterator_tag`
    * `using ParagraphIterator::value_type = Irange<utf_iterator>`
    * `using ParagraphIterator::pointer = const value_type*`
    * `using ParagraphIterator::reference = const value_type&`
    * `ParagraphIterator::ParagraphIterator()`
* `template <typename C> Irange<ParagraphIterator<C>> paragraph_range(const UtfIterator<C>& i, const UtfIterator<C>& j, uint32_t flags = 0)`
* `template <typename C> Irange<ParagraphIterator<C>> paragraph_range(const Irange<UtfIterator<C>>& source, uint32_t flags = 0)`
* `template <typename C> Irange<ParagraphIterator<C>> paragraph_range(const basic_string<C>& source, uint32_t flags = 0)`

A forward iterator over the paragraphs in a Unicode string. The flags passed
to the constructor determine how paragraphs are identified. By default, any
sequence of two or more line breaks ends a paragraph (as usual, CR+LF counts
as a single line break); the `line_paras` flag causes every line break to be
interpreted as a paragraph break, while `unicode_paras` restricts paragraph
breaks to the Unicode paragraph separator character (U+2029). The
`strip_breaks` flag works the same way as in `LineIterator`, skipping the
paragraph delimiters.
| Flag | Description |
|---|---|
| `multiline_paras` | Divide into paragraphs using multiple line breaks (default) |
| `line_paras` | Divide into paragraphs using any line break |
| `unicode_paras` | Divide into paragraphs using only Paragraph Separator |
| `keep_breaks` | Include paragraph terminators in reported segments (default) |
| `strip_breaks` | Do not include paragraph terminators |