#include <validator.h>
|
enum class | CharClass {
kConsonant = 'C'
, kVowel = 'V'
, kVirama = 'H'
, kMatra = 'M'
,
kMatraPiece = 'P'
, kVowelModifier = 'D'
, kZeroWidthNonJoiner = 'z'
, kZeroWidthJoiner = 'Z'
,
kVedicMark = 'v'
, kNukta = 'N'
, kRobat = 'R'
, kOther = 'O'
,
kWhitespace = ' '
, kCombiner = 'c'
} |
|
using | IndicPair = std::pair< CharClass, char32 > |
|
Definition at line 71 of file validator.h.
◆ IndicPair
◆ CharClass
Enumerator |
---|
kConsonant | |
kVowel | |
kVirama | |
kMatra | |
kMatraPiece | |
kVowelModifier | |
kZeroWidthNonJoiner | |
kZeroWidthJoiner | |
kVedicMark | |
kNukta | |
kRobat | |
kOther | |
kWhitespace | |
kCombiner | |
Definition at line 111 of file validator.h.
122 kVowelModifier = 'D',
static const char32 kZeroWidthNonJoiner
static const char32 kZeroWidthJoiner
◆ ~Validator()
tesseract::Validator::~Validator |
( |
| ) |
|
|
virtual default |
◆ Validator()
tesseract::Validator::Validator |
( |
ViramaScript |
script, |
|
|
bool |
report_errors |
|
) |
| |
|
inline protected |
◆ Clear()
void tesseract::Validator::Clear |
( |
| ) |
|
|
protected |
Definition at line 214 of file validator.cpp.
std::vector< char32 > output_
std::vector< IndicPair > codes_
std::vector< std::vector< char32 > > parts_
◆ CodeOnlyToOutput()
bool tesseract::Validator::CodeOnlyToOutput |
( |
| ) |
|
|
inline protected |
◆ ComputeClassCodes()
void tesseract::Validator::ComputeClassCodes |
( |
const std::vector< char32 > & |
text | ) |
|
|
protected |
Definition at line 206 of file validator.cpp.
207 codes_.reserve(text.size());
virtual CharClass UnicodeToCharClass(char32 ch) const =0
◆ ConsumeGraphemeIfValid()
virtual bool tesseract::Validator::ConsumeGraphemeIfValid |
( |
| ) |
|
|
protected pure virtual |
◆ IsSubscriptScript()
bool tesseract::Validator::IsSubscriptScript |
( |
| ) |
const |
|
protected |
◆ IsVedicAccent()
bool tesseract::Validator::IsVedicAccent |
( |
char32 |
unicode | ) |
|
|
static protected |
Definition at line 191 of file validator.cpp.
192 return (0x1cd0 <= unicode && unicode < 0x1d00) ||
193 (0xa8e0 <= unicode && unicode <= 0xa8f7) ||
194 (0x951 <= unicode && unicode <= 0x954);
◆ IsVirama()
bool tesseract::Validator::IsVirama |
( |
char32 |
unicode | ) |
|
|
static protected |
Definition at line 180 of file validator.cpp.
182 (unicode & 0x7f) == 0x4d) ||
static const char32 kSinhalaVirama
static const char32 kKhmerVirama
static const char32 kJavaneseVirama
static const char32 kMaxSinhalaUnicode
static const char32 kMyanmarVirama
static const char32 kMinIndicUnicode
◆ IsZeroWidthMark()
static bool tesseract::Validator::IsZeroWidthMark |
( |
char32 |
ch | ) |
|
|
inline static |
Definition at line 87 of file validator.h.
static const char32 kInvalid
static const char32 kRightToLeftMark
static const char32 kLeftToRightMark
static const char32 kZeroWidthSpace
◆ MostFrequentViramaScript()
ViramaScript tesseract::Validator::MostFrequentViramaScript |
( |
const std::vector< char32 > & |
utf32 | ) |
|
|
static protected |
Definition at line 143 of file validator.cpp.
145 std::unordered_map<int, int> histogram;
152 UScriptCode script_code = uscript_getScript(ch, err);
154 script_code != USCRIPT_COMMON) ||
155 script_code == USCRIPT_MYANMAR) {
156 if (script_code == USCRIPT_MYANMAR)
161 if (!histogram.empty()) {
163 std::max_element(histogram.begin(), histogram.end(), CmpPairSecond)
static const int kIndicCodePageSize
static const char32 kMaxJavaneseUnicode
◆ MoveResultsToDest()
void tesseract::Validator::MoveResultsToDest |
( |
GraphemeNormMode |
g_mode, |
|
|
std::vector< std::vector< char32 >> * |
dest |
|
) |
| |
|
protected |
◆ MultiCodePart()
void tesseract::Validator::MultiCodePart |
( |
unsigned |
length | ) |
|
|
inline protected |
◆ ScriptValidator()
std::unique_ptr< Validator > tesseract::Validator::ScriptValidator |
( |
ViramaScript |
script, |
|
|
bool |
report_errors |
|
) |
| |
|
static protected |
Definition at line 71 of file validator.cpp.
75 return std::unique_ptr<Validator>(
76 new ValidateGrapheme(script, report_errors));
78 return std::unique_ptr<Validator>(
79 new ValidateJavanese(script, report_errors));
81 return std::unique_ptr<Validator>(
82 new ValidateMyanmar(script, report_errors));
84 return std::unique_ptr<Validator>(
85 new ValidateKhmer(script, report_errors));
87 return std::unique_ptr<Validator>(
88 new ValidateIndic(script, report_errors));
◆ UnicodeToCharClass()
virtual CharClass tesseract::Validator::UnicodeToCharClass |
( |
char32 |
ch | ) |
const |
|
protected pure virtual |
◆ UseMultiCode()
bool tesseract::Validator::UseMultiCode |
( |
unsigned |
length | ) |
|
|
inline protected |
Definition at line 195 of file validator.h.
void MultiCodePart(unsigned length)
◆ ValidateCleanAndSegment()
bool tesseract::Validator::ValidateCleanAndSegment |
( |
GraphemeNormMode |
g_mode, |
|
|
bool |
report_errors, |
|
|
const std::vector< char32 > & |
src, |
|
|
std::vector< std::vector< char32 >> * |
dest |
|
) |
| |
|
static |
Definition at line 40 of file validator.cpp.
44 std::vector<std::vector<char32>> graphemes;
55 success = g_validator.ValidateCleanAndSegmentInternal(g_mode, src, dest);
57 success = g_validator.ValidateCleanAndSegmentInternal(
59 std::unique_ptr<Validator> validator(
61 for (const auto& grapheme : graphemes) {
62 if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme, dest)) {
static ViramaScript MostFrequentViramaScript(const std::vector< char32 > &utf32)
static std::unique_ptr< Validator > ScriptValidator(ViramaScript script, bool report_errors)
◆ ValidateCleanAndSegmentInternal()
bool tesseract::Validator::ValidateCleanAndSegmentInternal |
( |
GraphemeNormMode |
g_mode, |
|
|
const std::vector< char32 > & |
src, |
|
|
std::vector< std::vector< char32 >> * |
dest |
|
) |
| |
|
protected |
Definition at line 97 of file validator.cpp.
void ComputeClassCodes(const std::vector< char32 > &text)
void MoveResultsToDest(GraphemeNormMode g_mode, std::vector< std::vector< char32 >> *dest)
virtual bool ConsumeGraphemeIfValid()=0
◆ codes_
std::vector<IndicPair> tesseract::Validator::codes_ |
|
protected |
◆ codes_used_
unsigned tesseract::Validator::codes_used_ |
|
protected |
◆ kIndicCodePageSize
const int tesseract::Validator::kIndicCodePageSize = 128 |
|
static protected |
◆ kInvalid
const char32 tesseract::Validator::kInvalid = 0xfffd |
|
static |
◆ kJavaneseVirama
const char32 tesseract::Validator::kJavaneseVirama = 0xa9c0 |
|
static protected |
◆ kKhmerVirama
const char32 tesseract::Validator::kKhmerVirama = 0x17d2 |
|
static protected |
◆ kLeftToRightMark
const char32 tesseract::Validator::kLeftToRightMark = 0x200E |
|
static |
◆ kMaxJavaneseUnicode
const char32 tesseract::Validator::kMaxJavaneseUnicode = 0xa9df |
|
static protected |
◆ kMaxSinhalaUnicode
const char32 tesseract::Validator::kMaxSinhalaUnicode = 0xdff |
|
static protected |
◆ kMaxViramaScriptUnicode
const char32 tesseract::Validator::kMaxViramaScriptUnicode = 0x17ff |
|
static protected |
◆ kMinIndicUnicode
const char32 tesseract::Validator::kMinIndicUnicode = 0x900 |
|
static protected |
◆ kMyanmarVirama
const char32 tesseract::Validator::kMyanmarVirama = 0x1039 |
|
static protected |
◆ kRightToLeftMark
const char32 tesseract::Validator::kRightToLeftMark = 0x200F |
|
static |
◆ kSinhalaVirama
const char32 tesseract::Validator::kSinhalaVirama = 0xdca |
|
static protected |
◆ kZeroWidthJoiner
const char32 tesseract::Validator::kZeroWidthJoiner = 0x200D |
|
static |
◆ kZeroWidthNonJoiner
const char32 tesseract::Validator::kZeroWidthNonJoiner = 0x200C |
|
static |
◆ kZeroWidthSpace
const char32 tesseract::Validator::kZeroWidthSpace = 0x200B |
|
static |
◆ output_
std::vector<char32> tesseract::Validator::output_ |
|
protected |
◆ output_used_
unsigned tesseract::Validator::output_used_ |
|
protected |
◆ parts_
std::vector<std::vector<char32> > tesseract::Validator::parts_ |
|
protected |
◆ report_errors_
bool tesseract::Validator::report_errors_ |
|
protected |
◆ script_
The documentation for this class was generated from the following files: