4 #include <unordered_map>
9 #include "unicode/uchar.h"
10 #include "unicode/uscript.h"
42 std::vector<std::vector<char32>>*
dest) {
44 std::vector<std::vector<char32>> graphemes;
59 std::unique_ptr<Validator> validator(
61 for (
const auto& grapheme : graphemes) {
62 if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme,
dest)) {
75 return std::unique_ptr<Validator>(
78 return std::unique_ptr<Validator>(
81 return std::unique_ptr<Validator>(
84 return std::unique_ptr<Validator>(
87 return std::unique_ptr<Validator>(
99 std::vector<std::vector<char32>>*
dest) {
115 std::vector<std::vector<char32>>*
dest) {
126 dest->push_back(std::vector<char32>());
135 static bool CmpPairSecond(
const std::pair<int, int>& p1,
136 const std::pair<int, int>& p2) {
137 return p1.second < p2.second;
144 const std::vector<char32>& utf32) {
145 std::unordered_map<int, int> histogram;
152 UScriptCode script_code = uscript_getScript(ch, err);
154 script_code != USCRIPT_COMMON) ||
155 script_code == USCRIPT_MYANMAR) {
156 if (script_code == USCRIPT_MYANMAR)
161 if (!histogram.empty()) {
163 std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)
182 (unicode & 0x7f) == 0x4d) ||
192 return (0x1cd0 <= unicode && unicode < 0x1d00) ||
193 (0xa8e0 <= unicode && unicode <= 0xa8f7) ||
194 (0x951 <= unicode && unicode <= 0x954);
207 codes_.reserve(text.size());
static ViramaScript MostFrequentViramaScript(const std::vector< char32 > &utf32)
static const char32 kSinhalaVirama
static const char32 kZeroWidthNonJoiner
static const char32 kKhmerVirama
virtual CharClass UnicodeToCharClass(char32 ch) const =0
static const char32 kJavaneseVirama
std::vector< char32 > output_
static const char32 kInvalid
static const char32 kRightToLeftMark
static bool IsVedicAccent(char32 unicode)
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
static std::unique_ptr< Validator > ScriptValidator(ViramaScript script, bool report_errors)
static const char32 kLeftToRightMark
static bool IsVirama(char32 unicode)
static const int kIndicCodePageSize
static const char32 kZeroWidthSpace
bool IsSubscriptScript() const
std::vector< IndicPair > codes_
void ComputeClassCodes(const std::vector< char32 > &text)
static const char32 kZeroWidthJoiner
static const char32 kMaxJavaneseUnicode
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
static const char32 kMaxSinhalaUnicode
static const char32 kMyanmarVirama
static const char32 kMinIndicUnicode
std::vector< std::vector< char32 > > parts_
void MoveResultsToDest(GraphemeNormMode g_mode, std::vector< std::vector< char32 >> *dest)
virtual bool ConsumeGraphemeIfValid()=0