20 #ifndef TESSERACT_TRAINING_VALIDATOR_H_
21 #define TESSERACT_TRAINING_VALIDATOR_H_
82 const std::vector<char32>& src,
83 std::vector<std::vector<char32>>*
dest);
151 const std::vector<char32>& src,
152 std::vector<std::vector<char32>>*
dest);
155 std::vector<std::vector<char32>>*
dest);
160 const std::vector<char32>& utf32);
static ViramaScript MostFrequentViramaScript(const std::vector< char32 > &utf32)
static const char32 kSinhalaVirama
static const char32 kZeroWidthNonJoiner
static const char32 kKhmerVirama
virtual CharClass UnicodeToCharClass(char32 ch) const =0
static const char32 kJavaneseVirama
std::vector< char32 > output_
static const char32 kInvalid
static const char32 kRightToLeftMark
static bool IsVedicAccent(char32 unicode)
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
static std::unique_ptr< Validator > ScriptValidator(ViramaScript script, bool report_errors)
bool UseMultiCode(unsigned length)
void MultiCodePart(unsigned length)
static const char32 kMaxViramaScriptUnicode
static const char32 kLeftToRightMark
static bool IsVirama(char32 unicode)
static const int kIndicCodePageSize
std::pair< CharClass, char32 > IndicPair
static const char32 kZeroWidthSpace
static bool IsZeroWidthMark(char32 ch)
bool IsSubscriptScript() const
std::vector< IndicPair > codes_
void ComputeClassCodes(const std::vector< char32 > &text)
static const char32 kZeroWidthJoiner
static const char32 kMaxJavaneseUnicode
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, bool report_errors, const std::vector< char32 > &src, std::vector< std::vector< char32 >> *dest)
static const char32 kMaxSinhalaUnicode
Validator(ViramaScript script, bool report_errors)
static const char32 kMyanmarVirama
static const char32 kMinIndicUnicode
std::vector< std::vector< char32 > > parts_
void MoveResultsToDest(GraphemeNormMode g_mode, std::vector< std::vector< char32 >> *dest)
virtual bool ConsumeGraphemeIfValid()=0