39 static const char kSeparator =
'|';
41 static const char kNaturalFlag =
'n';
43 static const int ISALPHA_MASK = 0x1;
44 static const int ISLOWER_MASK = 0x2;
45 static const int ISUPPER_MASK = 0x4;
46 static const int ISDIGIT_MASK = 0x8;
47 static const int ISPUNCTUATION_MASK = 0x10;
52 static const int kMeanlineThreshold = 220;
75 const char* UNICHARSET::kCleanupMaps[][2] = {
88 const char* UNICHARSET::null_script =
"NULL";
90 UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {
95 void UNICHARSET::UNICHAR_PROPERTIES::Init() {
100 ispunctuation =
false;
114 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
116 max_bottom = UINT8_MAX;
128 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
129 min_bottom = UINT8_MAX;
143 bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty()
const {
144 return width == 0.0f || advance == 0.0f;
148 void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
149 const UNICHAR_PROPERTIES& src) {
150 UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
151 UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
154 if (src.width_sd > width_sd) {
156 width_sd = src.width_sd;
158 if (src.bearing_sd > bearing_sd) {
159 bearing = src.bearing;
160 bearing_sd = src.bearing_sd;
162 if (src.advance_sd > advance_sd) {
163 advance = src.advance;
164 advance_sd = src.advance_sd;
169 void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(
const UNICHAR_PROPERTIES& src) {
173 fragment = saved_fragment;
181 script_table(nullptr),
182 script_table_size_used(0) {
196 if (unichars_number > size_reserved) {
197 auto* unichars_new =
new UNICHAR_SLOT[unichars_number];
198 for (
int i = 0; i < size_used; ++i)
199 unichars_new[i] = unichars[i];
200 for (
int j = size_used; j < unichars_number; ++j) {
201 unichars_new[j].properties.script_id =
add_script(null_script);
204 unichars = unichars_new;
205 size_reserved = unichars_number;
211 std::string cleaned =
212 old_style_included_ ? unichar_repr :
CleanupString(unichar_repr);
213 return ids.
contains(cleaned.data(), cleaned.size())
215 : INVALID_UNICHAR_ID;
221 std::string cleaned(unichar_repr, length);
222 if (!old_style_included_) cleaned =
CleanupString(unichar_repr, length);
223 return ids.
contains(cleaned.data(), cleaned.size())
225 : INVALID_UNICHAR_ID;
237 if (encoding.
empty() || encoding[0] == INVALID_UNICHAR_ID)
return 0;
245 int *first_bad_position)
const {
247 return encode_string(str,
true, &encoding,
nullptr, first_bad_position);
262 int* encoded_length)
const {
267 int str_length = strlen(str);
270 while (str_pos < str_length) {
271 encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
272 &str_pos, encoding, &best_lengths);
273 if (str_pos < str_length) {
276 if (give_up_on_failure)
break;
277 int step = UNICHAR::utf8_step(str + str_pos);
282 working_encoding = *encoding;
283 working_lengths = best_lengths;
286 if (lengths !=
nullptr) *lengths = best_lengths;
287 if (encoded_length !=
nullptr) *encoded_length = str_pos;
292 if (
id == INVALID_UNICHAR_ID) {
293 return INVALID_UNICHAR;
296 return unichars[id].representation;
300 if (
id == INVALID_UNICHAR_ID) {
301 return INVALID_UNICHAR;
314 return unichars[id].representation;
324 for (
int i = 0; str[i] !=
'\0'; i +=
step) {
325 char hex[
sizeof(
int) * 2 + 1];
326 step = UNICHAR::utf8_step(str + i);
329 sprintf(hex,
"%x", str[i]);
374 unichars[unichar_id].properties.normed_ids.truncate(0);
376 unichars[unichar_id].properties.normed_ids.push_back(
UNICHAR_SPACE);
377 }
else if (!
encode_string(unichars[unichar_id].properties.normed.string(),
378 true, &unichars[unichar_id].properties.normed_ids,
380 unichars[unichar_id].properties.normed_ids.truncate(0);
381 unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
391 return (uni >= 0xE000 && uni <= 0xF8FF);
397 for (
int id = 0;
id < size_used; ++id) {
398 unichars[id].properties.SetRangesEmpty();
407 for (
int ch = start_index; ch < size_used; ++ch) {
409 UNICHAR_PROPERTIES properties;
410 if (src.GetStrProperties(utf8, &properties)) {
414 const char* other_case = src.
id_to_unichar(properties.other_case);
418 properties.other_case = ch;
420 const char* mirror_str = src.
id_to_unichar(properties.mirror);
424 properties.mirror = ch;
426 unichars[ch].properties.CopyFrom(properties);
436 for (
int ch = 0; ch < size_used; ++ch) {
438 UNICHAR_PROPERTIES properties;
439 if (src.GetStrProperties(utf8, &properties)) {
441 unichars[ch].properties.ExpandRangesFrom(properties);
450 for (
int ch = 0; ch < src.size_used; ++ch) {
451 const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
454 unichars[ch].properties.ExpandRangesFrom(src_props);
465 int initial_used = size_used;
466 for (
int ch = 0; ch < src.size_used; ++ch) {
467 const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
473 unichars[id].properties.ExpandRangesFrom(src_props);
476 unichars[id].properties.SetRangesEmpty();
487 int overlap = std::min(unichars[id1].properties.max_top,
488 unichars[id2].properties.max_top) -
489 std::max(unichars[id1].properties.min_top,
490 unichars[id2].properties.min_top);
505 int* best_total_length,
508 if (str_index > *best_total_length) {
510 *best_total_length = str_index;
511 *best_encoding = *encoding;
512 if (best_lengths !=
nullptr)
513 *best_lengths = *lengths;
515 if (str_index == str_length)
return;
516 int encoding_index = encoding->
size();
518 int length = ids.
minmatch(str + str_index);
519 if (length == 0 || str_index + length > str_length)
return;
521 if (ids.
contains(str + str_index, length)) {
526 encode_string(str, str_index + length, str_length, encoding, lengths,
527 best_total_length, best_encoding, best_lengths);
528 if (*best_total_length == str_length)
534 int step = UNICHAR::utf8_step(str + str_index + length);
537 }
while (length <=
UNICHAR_LEN && str_index + length <= str_length);
545 bool UNICHARSET::GetStrProperties(
const char* utf8_str,
546 UNICHAR_PROPERTIES* props)
const {
548 props->SetRangesEmpty();
549 int total_unicodes = 0;
551 if (!
encode_string(utf8_str,
true, &encoding,
nullptr,
nullptr))
553 for (
int i = 0; i < encoding.
size(); ++i) {
554 int id = encoding[i];
555 const UNICHAR_PROPERTIES& src_props = unichars[id].properties;
557 if (src_props.isalpha) props->isalpha =
true;
558 if (src_props.islower) props->islower =
true;
559 if (src_props.isupper) props->isupper =
true;
560 if (src_props.isdigit) props->isdigit =
true;
561 if (src_props.ispunctuation) props->ispunctuation =
true;
562 if (src_props.isngram) props->isngram =
true;
563 if (src_props.enabled) props->enabled =
true;
565 UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);
566 UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);
567 UpdateRange(src_props.min_top, &props->min_top, &props->max_top);
568 UpdateRange(src_props.max_top, &props->min_top, &props->max_top);
569 float bearing = props->advance + src_props.bearing;
570 if (total_unicodes == 0 || bearing < props->bearing) {
571 props->bearing = bearing;
572 props->bearing_sd = props->advance_sd + src_props.bearing_sd;
574 props->advance += src_props.advance;
575 props->advance_sd += src_props.advance_sd;
577 props->width = src_props.width;
578 props->width_sd = src_props.width_sd;
581 if (total_unicodes == 0) {
582 props->script_id = src_props.script_id;
583 props->other_case = src_props.other_case;
584 props->mirror = src_props.mirror;
585 props->direction = src_props.direction;
589 props->normed += src_props.normed;
592 if (total_unicodes > 1) {
594 props->width = props->advance - props->bearing;
595 props->width_sd = props->advance_sd + props->bearing_sd;
597 return total_unicodes > 0;
603 unsigned int properties = 0;
605 properties |= ISALPHA_MASK;
607 properties |= ISLOWER_MASK;
609 properties |= ISUPPER_MASK;
611 properties |= ISDIGIT_MASK;
613 properties |= ISPUNCTUATION_MASK;
629 std::string cleaned =
630 old_style_included_ ? unichar_repr :
CleanupString(unichar_repr);
631 if (!cleaned.empty() && !ids.
contains(cleaned.data(), cleaned.size())) {
632 const char* str = cleaned.c_str();
634 if (!old_style_included_ &&
637 if (size_used == size_reserved) {
646 fprintf(stderr,
"Utf8 buffer too big, size>%d for %s\n",
UNICHAR_LEN,
650 unichars[size_used].representation[index++] = *str++;
651 }
while (*str !=
'\0');
652 unichars[size_used].representation[index] =
'\0';
660 this->unichars[size_used].properties.fragment = frag;
662 this->unichars[size_used].properties.script_id =
665 this->unichars[size_used].properties.enabled =
true;
666 ids.
insert(unichars[size_used].representation, size_used);
672 std::string cleaned =
673 old_style_included_ ? unichar_repr :
CleanupString(unichar_repr);
674 return ids.
contains(cleaned.data(), cleaned.size());
682 std::string cleaned(unichar_repr, length);
683 if (!old_style_included_) cleaned =
CleanupString(unichar_repr, length);
684 return ids.
contains(cleaned.data(), cleaned.size());
688 const char*
const unichar_repr)
const {
689 return strcmp(this->
id_to_unichar(unichar_id), unichar_repr) == 0;
693 const int kFileBufSize = 1024;
694 char buffer[kFileBufSize + 1];
695 snprintf(buffer, kFileBufSize,
"%d\n", this->
size());
698 int min_bottom, max_bottom, min_top, max_top;
700 float width, width_sd;
702 float bearing, bearing_sd;
704 float advance, advance_sd;
708 snprintf(buffer, kFileBufSize,
"%s %x %s %d\n",
"NULL", properties,
713 std::ostringstream stream;
714 stream.imbue(std::locale::classic());
715 stream << this->
id_to_unichar(
id) <<
' ' << properties <<
' ' <<
716 min_bottom <<
',' << max_bottom <<
',' <<
717 min_top <<
',' << max_top <<
',' <<
718 width <<
',' << width_sd <<
',' <<
719 bearing <<
',' << bearing_sd <<
',' <<
720 advance <<
',' << advance_sd <<
' ' <<
727 *str += stream.str().
c_str();
737 : memory_(memory), fgets_ptr_(memory), mem_size_(mem_size) { }
739 char *
fgets(
char *orig_dst,
int size) {
740 const char *src_end = memory_ + mem_size_;
741 char *dst_end = orig_dst + size - 1;
743 return fgets_ptr_ < src_end ? orig_dst :
nullptr;
746 char *dst = orig_dst;
748 while (fgets_ptr_ < src_end && dst < dst_end && ch !=
'\n') {
749 ch = *dst++ = *fgets_ptr_++;
752 return (dst == orig_dst) ? nullptr : orig_dst;
757 const char *fgets_ptr_;
762 const char *memory,
int mem_size,
bool skip_fragments) {
766 bool success = load_via_fgets(fgets_cb, skip_fragments);
775 return ::fgets(dst, size, fp_);
785 bool success = load_via_fgets(fgets_cb, skip_fragments);
793 bool success = load_via_fgets(fgets_cb, skip_fragments);
798 bool UNICHARSET::load_via_fgets(
800 bool skip_fragments) {
805 if (fgets_cb->
Run(buffer,
sizeof(buffer)) ==
nullptr ||
806 sscanf(buffer,
"%d", &unicharset_size) != 1) {
809 this->
reserve(unicharset_size);
810 for (
UNICHAR_ID id = 0;
id < unicharset_size; ++id) {
812 unsigned int properties;
815 strncpy(script, null_script,
sizeof(script) - 1);
817 int max_bottom = UINT8_MAX;
819 int max_top = UINT8_MAX;
821 float width_sd = 0.0f;
822 float bearing = 0.0f;
823 float bearing_sd = 0.0f;
824 float advance = 0.0f;
825 float advance_sd = 0.0f;
831 if (fgets_cb->
Run(buffer, sizeof (buffer)) ==
nullptr) {
836 std::istringstream stream(buffer);
837 stream.imbue(std::locale::classic());
840 stream >> std::setw(255) >> unichar >> std::hex >> properties >> std::dec;
843 fprintf(stderr,
"%s:%u failed\n", __FILE__, __LINE__);
846 auto position = stream.tellg();
847 stream.seekg(position);
848 char c1, c2, c3, c4, c5, c6, c7, c8, c9;
849 stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >> c4 >>
850 width >> c5 >>width_sd >> c6 >> bearing >> c7 >> bearing_sd >> c8 >>
851 advance >> c9 >> advance_sd >> std::setw(63) >> script >>
852 other_case >> direction >> mirror >> std::setw(63) >> normed;
853 if (stream.fail() || c1 !=
',' || c2 !=
',' || c3 !=
',' || c4 !=
',' ||
854 c5 !=
',' || c6 !=
',' || c7 !=
',' || c8 !=
',' || c9 !=
',') {
856 stream.seekg(position);
857 stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >> c4 >>
858 width >> c5 >>width_sd >> c6 >> bearing >> c7 >> bearing_sd >> c8 >>
859 advance >> c9 >> advance_sd >> std::setw(63) >> script >>
860 other_case >> direction >> mirror;
861 if (stream.fail() || c1 !=
',' || c2 !=
',' || c3 !=
',' || c4 !=
',' ||
862 c5 !=
',' || c6 !=
',' || c7 !=
',' || c8 !=
',' || c9 !=
',') {
864 stream.seekg(position);
865 stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >>
866 std::setw(63) >> script >> other_case >> direction >> mirror;
867 if (stream.fail() || c1 !=
',' || c2 !=
',' || c3 !=
',') {
869 stream.seekg(position);
870 stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >>
871 std::setw(63) >> script >> other_case;
872 if (stream.fail() || c1 !=
',' || c2 !=
',' || c3 !=
',') {
874 stream.seekg(position);
875 stream >> std::setw(63) >> script >> other_case;
878 stream.seekg(position);
879 stream >> std::setw(63) >> script;
896 if (strcmp(unichar,
"NULL") == 0)
908 this->unichars[id].properties.enabled =
true;
909 this->
set_top_bottom(
id, min_bottom, max_bottom, min_top, max_top);
915 id, (other_case < unicharset_size) ? other_case :
id);
916 this->
set_mirror(
id, (mirror < unicharset_size) ? mirror :
id);
917 this->
set_normed(
id, normed[0] !=
'\0' ? normed : unichar);
929 int net_case_alphas = 0;
930 int x_height_alphas = 0;
931 int cap_height_alphas = 0;
932 top_bottom_set_ =
false;
933 for (
UNICHAR_ID id = 0;
id < size_used; ++id) {
935 int max_bottom = UINT8_MAX;
937 int max_top = UINT8_MAX;
940 top_bottom_set_ =
true;
946 if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
948 else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
954 script_has_upper_lower_ = net_case_alphas > 0;
955 script_has_xheight_ = script_has_upper_lower_ ||
973 int* script_counts =
new int[script_table_size_used];
974 memset(script_counts, 0,
sizeof(*script_counts) * script_table_size_used);
975 for (
int id = 0;
id < size_used; ++id) {
981 for (
int s = 1; s < script_table_size_used; ++s) {
982 if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
985 delete [] script_counts;
995 for (
int id = 0;
id < size_used; ++id) {
1002 return rtl_count > ltr_count;
1010 const char* whitelist,
1011 const char* unblacklist) {
1012 bool def_enabled = whitelist ==
nullptr || whitelist[0] ==
'\0';
1014 for (
int ch = 0; ch < size_used; ++ch)
1015 unichars[ch].properties.enabled = def_enabled;
1019 encode_string(whitelist,
false, &encoding,
nullptr,
nullptr);
1020 for (
int i = 0; i < encoding.
size(); ++i) {
1021 if (encoding[i] != INVALID_UNICHAR_ID)
1022 unichars[encoding[i]].properties.enabled =
true;
1025 if (blacklist !=
nullptr && blacklist[0] !=
'\0') {
1028 encode_string(blacklist,
false, &encoding,
nullptr,
nullptr);
1029 for (
int i = 0; i < encoding.
size(); ++i) {
1030 if (encoding[i] != INVALID_UNICHAR_ID)
1031 unichars[encoding[i]].properties.enabled =
false;
1034 if (unblacklist !=
nullptr && unblacklist[0] !=
'\0') {
1037 encode_string(unblacklist,
false, &encoding,
nullptr,
nullptr);
1038 for (
int i = 0; i < encoding.
size(); ++i) {
1039 if (encoding[i] != INVALID_UNICHAR_ID)
1040 unichars[encoding[i]].properties.enabled =
true;
1050 for (
int id = start_id;
id < size_used; ++id) {
1053 for (
size_t u = 1; u < unicodes.size(); ++u) {
1054 if (unicodes[u - 1] == unicodes[u])
return true;
1061 for (
int i = 0; i < script_table_size_used; ++i) {
1062 if (strcmp(script, script_table[i]) == 0)
1065 if (script_table_size_reserved == 0) {
1066 script_table_size_reserved = 8;
1067 script_table =
new char*[script_table_size_reserved];
1068 }
else if (script_table_size_used >= script_table_size_reserved) {
1069 assert(script_table_size_used == script_table_size_reserved);
1070 script_table_size_reserved += script_table_size_reserved;
1071 char** new_script_table =
new char*[script_table_size_reserved];
1072 memcpy(new_script_table, script_table,
1073 script_table_size_used *
sizeof(
char*));
1074 delete[] script_table;
1075 script_table = new_script_table;
1077 script_table[script_table_size_used] =
new char[strlen(script) + 1];
1078 strcpy(script_table[script_table_size_used], script);
1079 return script_table_size_used++;
1086 if (total == 1)
return STRING(unichar);
1088 result += kSeparator;
1091 snprintf(buffer,
kMaxLen,
"%c%d%c%d", kSeparator, pos,
1092 natural ? kNaturalFlag : kSeparator, total);
1098 const char *ptr = string;
1099 int len = strlen(
string);
1100 if (len <
kMinLen || *ptr != kSeparator) {
1105 while ((ptr + step) < (
string + len) && *(ptr + step) != kSeparator) {
1106 step += UNICHAR::utf8_step(ptr + step);
1112 strncpy(unichar, ptr, step);
1113 unichar[step] =
'\0';
1117 bool natural =
false;
1118 char *end_ptr =
nullptr;
1119 for (
int i = 0; i < 2; i++) {
1120 if (ptr >
string + len || *ptr != kSeparator) {
1121 if (i == 1 && *ptr == kNaturalFlag)
1127 i == 0 ? pos =
static_cast<int>(strtol(ptr, &end_ptr, 10))
1128 : total =
static_cast<int>(strtol(ptr, &end_ptr, 10));
1131 if (ptr !=
string + len) {
1135 fragment->set_all(unichar, pos, total, natural);
1140 for (
int i = 0; i < script_table_size_used; ++i) {
1141 if (strcmp(script_name, script_table[i]) == 0)
1152 result.reserve(length);
1154 while ((ch = *utf8_str) !=
'\0' && length-- > 0) {
1157 while ((key = kCleanupMaps[key_index][0]) !=
nullptr) {
1159 while (key[match] !=
'\0' && key[match] == utf8_str[match]) ++match;
1160 if (key[match] ==
'\0') {
1166 if (key ==
nullptr) {
1167 result.push_back(ch);
1170 result.append(kCleanupMaps[key_index][1]);