// Characters used when a fragment is written to / parsed from a string:
// kSeparator delimits the fields, kNaturalFlag replaces the second separator
// for "natural" fragments.
31 static const char kSeparator =
'|';
33 static const char kNaturalFlag =
'n';
// Single-bit flags that are OR-ed together into one unsigned properties word.
35 static const int ISALPHA_MASK = 0x1;
36 static const int ISLOWER_MASK = 0x2;
37 static const int ISUPPER_MASK = 0x4;
38 static const int ISDIGIT_MASK = 0x8;
39 static const int ISPUNCTUATION_MASK = 0x10;
// Value that min_top / max_top of a character are compared against when
// counting characters whose top lies below or above it.
44 static const int kMeanlineThreshold = 220;
// Default constructor of UNICHAR_PROPERTIES.
64 UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {
// Init(): of the assignments it performs, the one shown here clears the
// ispunctuation flag.
69 void UNICHARSET::UNICHAR_PROPERTIES::Init() {
74 ispunctuation =
false;
// NOTE(review): presumably sets every min/max pair to its widest possible
// range - confirm against the body.
88 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
// NOTE(review): presumably sets every min/max pair so that min > max, i.e. the
// state AnyRangeEmpty() tests for - confirm against the body.
102 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
// Returns true if at least one of the five min/max pairs (bottom, top, width,
// bearing, advance) has its minimum strictly greater than its maximum.
117 bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty()
const {
118 return min_bottom > max_bottom || min_top > max_top ||
119 min_width > max_width || min_bearing > max_bearing ||
120 min_advance > max_advance;
// Passes both ends of each of src's ranges to UpdateRange together with
// pointers to this object's matching min/max members.
// NOTE(review): presumably UpdateRange widens [min, max] so that it contains
// the given value - its definition is not in view.
124 void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
125 const UNICHAR_PROPERTIES& src) {
126 UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
127 UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
130 UpdateRange(src.min_width, &min_width, &max_width);
131 UpdateRange(src.max_width, &min_width, &max_width);
132 UpdateRange(src.min_bearing, &min_bearing, &max_bearing);
133 UpdateRange(src.max_bearing, &min_bearing, &max_bearing);
134 UpdateRange(src.min_advance, &min_advance, &max_advance);
135 UpdateRange(src.max_advance, &min_advance, &max_advance);
// Copies properties from src into this object.
139 void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(
const UNICHAR_PROPERTIES& src) {
// fragment is assigned from saved_fragment rather than from src.
// NOTE(review): presumably saved_fragment holds this object's own fragment
// from before the copy - confirm where it is set.
143 fragment = saved_fragment;
// Member initializers: no script table entries are in use yet, and
// null_script is initialised from the literal below.
152 script_table_size_used(0),
153 null_script(
"NULL") {
// Grows the slot array when more slots are requested than are reserved.
162 if (unichars_number > size_reserved) {
163 UNICHAR_SLOT* unichars_new =
new UNICHAR_SLOT[unichars_number];
// Carry over the slots that are already in use.
164 for (
int i = 0; i < size_used; ++i)
165 unichars_new[i] = unichars[i];
// Give every not-yet-used slot the script id registered for null_script.
166 for (
int j = size_used; j < unichars_number; ++j) {
167 unichars_new[j].properties.script_id =
add_script(null_script);
// Switch over to the new storage.
// NOTE(review): the release of the previous array is not visible here -
// confirm it is deleted before this assignment.
170 unichars = unichars_new;
171 size_reserved = unichars_number;
// Looks the (unichar_repr, length) pair up in ids only when ids reports that
// it contains it; otherwise yields INVALID_UNICHAR_ID.
184 return ids.
contains(unichar_repr, length) ?
185 ids.
unichar_to_id(unichar_repr, length) : INVALID_UNICHAR_ID;
// Candidate lengths start at minlength.
198 int goodlength = minlength;
// This test holds when str ends at goodlength or when ids.minmatch()
// returns a positive value for the text that follows.
200 if (str[goodlength] ==
'\0' || ids.
minmatch(str + goodlength) > 0)
// Closing part of a do/while: it repeats while str has not ended,
// goodlength is still <= UNICHAR_LEN, and a further condition not shown here.
206 }
while (str[goodlength] !=
'\0' && goodlength <=
UNICHAR_LEN &&
// Scans str from offset 0 up to strlen(str). The for-header has no increment
// expression, so i is advanced elsewhere in the loop.
221 int *first_bad_position)
const {
222 for (
int i = 0, len = strlen(str); i < len; ) {
223 int increment =
step(str + i);
// step() returning 0 marks offset i as bad; the offset is reported through
// first_bad_position when the caller supplied a non-null pointer.
224 if (increment == 0) {
225 if (first_bad_position) *first_bad_position = i;
// INVALID_UNICHAR_ID maps to INVALID_UNICHAR; any other id returns the
// representation stored in its slot.
234 if (
id == INVALID_UNICHAR_ID) {
235 return INVALID_UNICHAR;
238 return unichars[id].representation;
// Same guard as above: INVALID_UNICHAR_ID yields INVALID_UNICHAR.
242 if (
id == INVALID_UNICHAR_ID) {
243 return INVALID_UNICHAR;
// Fall-through case returns the stored representation.
// NOTE(review): the statements between the guard and this return are not
// visible here.
256 return unichars[id].representation;
// Walks str in increments of `step` until the terminating NUL.
266 for (
int i = 0; str[i] !=
'\0'; i +=
step) {
// Room for two hex digits per byte of an int plus the terminator.
267 char hex[
sizeof(int) * 2 + 1];
// NOTE(review): str[i] is a char promoted to int for %x. Where char is signed,
// bytes >= 0x80 become negative and print as a full-width value (ffffffXX on a
// 4-byte int) - confirm this output is intended.
271 sprintf(hex,
"%x", str[i]);
// True when uni lies in 0xE000..0xF8FF inclusive (the Unicode Basic
// Multilingual Plane private-use range).
319 return (uni >= 0xE000 && uni <= 0xF8FF);
// Calls SetRangesEmpty() on the properties of every slot in use.
325 for (
int id = 0;
id < size_used; ++id) {
326 unichars[id].properties.SetRangesEmpty();
// For every slot in use, asks src for the properties of the string utf8 and,
// when src can supply them, copies them into this slot.
334 for (
int ch = 0; ch < size_used; ++ch) {
336 UNICHAR_PROPERTIES properties;
337 if (src.GetStrProperties(utf8, &properties)) {
// other_case / mirror returned by src are ids in src, so they are turned
// back into strings through src before being used here.
341 const char* other_case = src.
id_to_unichar(properties.other_case);
// NOTE(review): presumably the fallback when the string cannot be mapped to
// an id in this set - the guarding condition is not visible.
345 properties.other_case = ch;
347 const char* mirror_str = src.
id_to_unichar(properties.mirror);
// NOTE(review): same presumed fallback as for other_case - confirm.
351 properties.mirror = ch;
353 unichars[ch].properties.CopyFrom(properties);
// For every slot in use, asks src for the properties of the string utf8 and,
// when src can supply them, passes them to ExpandRangesFrom for this slot.
362 for (
int ch = 0; ch < size_used; ++ch) {
364 UNICHAR_PROPERTIES properties;
365 if (src.GetStrProperties(utf8, &properties)) {
367 unichars[ch].properties.ExpandRangesFrom(properties);
// Visits every used entry of src.
376 for (
int ch = 0; ch < src.size_used; ++ch) {
377 const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
// Any entry other than a single space whose ranges are empty is reported
// with all ten of its range values.
379 if (strcmp(utf8,
" ") != 0 && src_props.AnyRangeEmpty()) {
381 tprintf(
"Bad properties for char %s: %d,%d %d,%d %d,%d %d,%d %d,%d\n",
382 utf8, src_props.min_bottom, src_props.max_bottom,
383 src_props.min_top, src_props.max_top,
384 src_props.min_width, src_props.max_width,
385 src_props.min_bearing, src_props.max_bearing,
386 src_props.min_advance, src_props.max_advance);
// NOTE(review): presumably executed only for an entry that has just been
// added to this set - the enclosing branch is not visible.
394 unichars[id].properties.SetRangesEmpty();
// A slot that already has non-empty ranges is only widened with src's ranges.
396 if (!unichars[
id].properties.AnyRangeEmpty()) {
398 unichars[id].properties.ExpandRangesFrom(src_props);
// NOTE(review): presumably the else-branch: take src's properties wholesale,
// then re-map script_id, other_case and mirror into this set.
401 unichars[id].properties.CopyFrom(src_props);
404 unichars[id].properties.script_id =
add_script(script);
// src_props.other_case is an id in src; convert it to its string first.
405 const char* other_case = src.
id_to_unichar(src_props.other_case);
// Clears the ranges of the last slot in use.
// NOTE(review): presumably follows an insertion of other_case - confirm.
408 unichars[size_used - 1].properties.SetRangesEmpty();
411 unichars[id].properties.other_case =
unichar_to_id(other_case);
// Same conversion for the mirror character.
412 const char* mirror_str = src.
id_to_unichar(src_props.mirror);
415 unichars[size_used - 1].properties.SetRangesEmpty();
// Fills *props with properties combined over the unichars that make up
// utf8_str. Returns false as soon as step() returns 0 or an id is negative;
// otherwise returns whether total_unicodes ended up greater than zero.
428 bool UNICHARSET::GetStrProperties(
const char* utf8_str,
429 UNICHAR_PROPERTIES* props)
const {
// Ranges start empty; advances start at 0 because they are summed below.
431 props->SetRangesEmpty();
432 props->min_advance = 0;
433 props->max_advance = 0;
435 int total_unicodes = 0;
436 for (
int offset = 0; utf8_str[offset] !=
'\0'; offset += utf8_step) {
437 utf8_step =
step(utf8_str + offset);
438 if (utf8_step == 0)
return false;
440 if (
id < 0)
return false;
441 const UNICHAR_PROPERTIES& src_props = unichars[id].properties;
// Boolean flags are sticky: once any component sets one it stays set.
443 if (src_props.isalpha) props->isalpha =
true;
444 if (src_props.islower) props->islower =
true;
445 if (src_props.isupper) props->isupper =
true;
446 if (src_props.isdigit) props->isdigit =
true;
447 if (src_props.ispunctuation) props->ispunctuation =
true;
448 if (src_props.isngram) props->isngram =
true;
449 if (src_props.enabled) props->enabled =
true;
// Vertical ranges are merged through UpdateRange.
451 UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);
452 UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);
453 UpdateRange(src_props.min_top, &props->min_top, &props->max_top);
454 UpdateRange(src_props.max_top, &props->min_top, &props->max_top);
// The component's bearing is offset by the advance accumulated so far; the
// smallest offset value seen is kept.
455 int bearing = props->min_advance + src_props.min_bearing;
456 if (total_unicodes == 0 || bearing < props->min_bearing)
457 props->min_bearing = bearing;
458 bearing = props->max_advance + src_props.max_bearing;
// NOTE(review): max_bearing also uses `<`, so it keeps the smallest value as
// well - confirm that a minimum (not a maximum) is intended here.
459 if (total_unicodes == 0 || bearing < props->max_bearing)
460 props->max_bearing = bearing;
461 props->min_advance += src_props.min_advance;
462 props->max_advance += src_props.max_advance;
// Width is overwritten with the current component's width; for strings of
// more than one component it is recomputed after the loop.
464 props->min_width = src_props.min_width;
465 props->max_width = src_props.max_width;
// Script, other_case, mirror and direction come from the first component.
468 if (total_unicodes == 0) {
469 props->script_id = src_props.script_id;
470 props->other_case = src_props.other_case;
471 props->mirror = src_props.mirror;
472 props->direction = src_props.direction;
// normed accumulates each component's normed value via +=.
476 props->normed += src_props.normed;
// With more than one component, width is derived from total advance minus
// bearing instead of from the last component.
479 if (total_unicodes > 1) {
481 props->min_width = props->min_advance - props->max_bearing;
482 props->max_width = props->max_advance - props->min_bearing;
484 return total_unicodes > 0;
// Builds a bitmask starting from 0; each statement below ORs in one mask bit.
// NOTE(review): the conditions guarding each OR are not visible here.
488 unsigned int properties = 0;
490 properties |= ISALPHA_MASK;
492 properties |= ISLOWER_MASK;
494 properties |= ISUPPER_MASK;
496 properties |= ISDIGIT_MASK;
498 properties |= ISPUNCTUATION_MASK;
// Diagnostic written to stderr with the byte length and text of unichar_repr.
514 fprintf(stderr,
"Utf8 buffer too big, size=%d for %s\n",
515 int(strlen(unichar_repr)), unichar_repr);
// Every reserved slot is in use.
// NOTE(review): presumably triggers a growth of the slot array - body not
// visible.
518 if (size_used == size_reserved) {
// NOTE(review): strcpy is unbounded; presumably the length check that emits
// the diagnostic above guarantees unichar_repr fits - confirm.
525 strcpy(unichars[size_used].representation, unichar_repr);
// Initialise the new slot's fragment, script id and enabled flag.
532 this->unichars[size_used].properties.fragment = frag;
534 this->unichars[size_used].properties.script_id =
537 this->unichars[size_used].properties.enabled =
true;
// Register the representation under the index of the new slot.
538 ids.
insert(unichar_repr, size_used);
// Forwards the membership test for (unichar_repr, length) to ids.
552 return ids.
contains(unichar_repr, length);
// True when the string for unichar_id, as given by id_to_unichar(), compares
// equal (strcmp) to unichar_repr.
556 const char*
const unichar_repr)
const {
557 return strcmp(this->
id_to_unichar(unichar_id), unichar_repr) == 0;
// First line written to file: the value of size().
561 fprintf(file,
"%d\n", this->
size());
// Locals for the ten range values written per entry.
563 int min_bottom, max_bottom, min_top, max_top;
565 int min_width, max_width;
567 int min_bearing, max_bearing;
569 int min_advance, max_advance;
// Short record: the literal below, the properties word in hex, then a string
// and an int supplied by arguments that are not visible here.
573 fprintf(file,
"%s %x %s %d\n",
"NULL", properties,
// Full record: string, properties in hex, ten comma-separated range values,
// then further fields and a trailing "# ..." part after a tab.
578 "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %s %d %d %d %s\t# %s\n",
580 min_bottom, max_bottom, min_top, max_top, min_width, max_width,
581 min_bearing, max_bearing, min_advance, max_advance,
// Constructor initializers: the read cursor fgets_ptr_ starts at the
// beginning of the memory block.
594 : memory_(memory), fgets_ptr_(memory), mem_size_(mem_size) { }
// Line reader over the memory block: copies characters into orig_dst and
// returns orig_dst, or NULL when nothing was copied.
596 char *
fgets(
char *orig_dst,
int size) {
597 const char *src_end = memory_ + mem_size_;
// One byte short of the end of the destination.
// NOTE(review): presumably reserved for a terminating NUL - the write of the
// terminator is not visible.
598 char *dst_end = orig_dst + size - 1;
// Early return whose guarding condition is not visible: orig_dst while data
// remains, NULL once the cursor has reached the end of the block.
600 return fgets_ptr_ < src_end ? orig_dst :
NULL;
603 char *dst = orig_dst;
// Copy until the block is exhausted, the destination is full, or the
// character just copied was a newline (so the newline itself is copied).
605 while (fgets_ptr_ < src_end && dst < dst_end && ch !=
'\n') {
606 ch = *dst++ = *fgets_ptr_++;
609 return (dst == orig_dst) ?
NULL : orig_dst;
// Current read position inside the memory block.
614 const char *fgets_ptr_;
// Loader taking a memory block (memory, mem_size); it hands the actual parsing
// to load_via_fgets through the callback fgets_cb.
619 const char *memory,
int mem_size,
bool skip_fragments) {
623 bool success = load_via_fgets(fgets_cb, skip_fragments);
// A second loader delegates to load_via_fgets in the same way.
642 bool success = load_via_fgets(fgets_cb, skip_fragments);
// Loads the set by reading text lines through fgets_cb.
647 bool UNICHARSET::load_via_fgets(
649 bool skip_fragments) {
// The first line must yield one integer: the number of entries.
654 if (fgets_cb->
Run(buffer,
sizeof(buffer)) ==
NULL ||
655 sscanf(buffer,
"%d", &unicharset_size) != 1) {
658 this->
reserve(unicharset_size);
// One line is read per entry.
659 for (
UNICHAR_ID id = 0;
id < unicharset_size; ++id) {
661 unsigned int properties;
// script defaults to the null script in case the line has no script field.
664 strcpy(script, null_script);
// The line is tried against progressively shorter layouts (17, 16, 10, 8, 4,
// 3, then 2 fields); v records the field count of the attempt that matched.
// NOTE(review): unichar is read with an unbounded %s while script and normed
// use %63s; the size of unichar's buffer is not visible here - confirm it can
// hold the longest token of a line in buffer.
682 if (fgets_cb->
Run(buffer, sizeof (buffer)) ==
NULL ||
684 "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d %63s",
685 unichar, &properties,
686 &min_bottom, &max_bottom, &min_top, &max_top,
687 &min_width, &max_width, &min_bearing, &max_bearing,
688 &min_advance, &max_advance, script, &other_case,
689 &direction, &mirror, normed)) != 17 &&
691 "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %63s %d %d %d",
692 unichar, &properties,
693 &min_bottom, &max_bottom, &min_top, &max_top,
694 &min_width, &max_width, &min_bearing, &max_bearing,
695 &min_advance, &max_advance,
696 script, &other_case, &direction, &mirror)) != 16 &&
697 (v = sscanf(buffer,
"%s %x %d,%d,%d,%d %63s %d %d %d",
698 unichar, &properties,
699 &min_bottom, &max_bottom, &min_top, &max_top,
700 script, &other_case, &direction, &mirror)) != 10 &&
701 (v = sscanf(buffer,
"%s %x %d,%d,%d,%d %63s %d", unichar, &properties,
702 &min_bottom, &max_bottom, &min_top, &max_top,
703 script, &other_case)) != 8 &&
704 (v = sscanf(buffer,
"%s %x %63s %d", unichar, &properties,
705 script, &other_case)) != 4 &&
706 (v = sscanf(buffer,
"%s %x %63s",
707 unichar, &properties, script)) != 3 &&
// NOTE(review): unlike the alternatives above, the `!= 2` here sits inside the
// assignment's parentheses, so v receives the comparison result (0 or 1)
// instead of the field count. The overall condition is unaffected, but v no
// longer holds 2 after a two-field match - confirm and fix the parentheses.
708 (v = sscanf(buffer,
"%s %x", unichar, &properties) != 2))) {
// The literal entry name below is handled specially.
719 if (strcmp(unichar,
"NULL") == 0)
// Properties of the loaded entry are stored into its slot.
731 this->unichars[id].properties.enabled =
true;
732 this->
set_top_bottom(
id, min_bottom, max_bottom, min_top, max_top);
736 this->
set_direction(
id, static_cast<UNICHARSET::Direction>(direction));
// normed was only parsed by the 17-field layout; otherwise the unichar
// string itself is used.
741 this->
set_normed(
id, (v>16) ? normed : unichar);
// Counters gathered over all entries, plus the flag recording whether any
// entry has top/bottom data; the flag is reset before the scan.
753 int net_case_alphas = 0;
754 int x_height_alphas = 0;
755 int cap_height_alphas = 0;
756 top_bottom_set_ =
false;
757 for (
UNICHAR_ID id = 0;
id < size_used; ++id) {
764 top_bottom_set_ =
true;
// An entry is classified only when both ends of its top range lie on the same
// side of kMeanlineThreshold.
770 if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
772 else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
// Summary flags derived from the counters.
777 script_has_upper_lower_ = net_case_alphas > 0;
778 script_has_xheight_ = script_has_upper_lower_ ||
// Temporary per-script counters, one per script table entry, zeroed here and
// released at the end of this fragment.
794 int* script_counts =
new int[script_table_size_used];
795 memset(script_counts, 0,
sizeof(*script_counts) * script_table_size_used);
796 for (
int id = 0;
id < size_used; ++id) {
// Scan from script index 1 for a script, other than common_sid_, whose count
// exceeds that of the current default_sid_.
802 for (
int s = 1; s < script_table_size_used; ++s) {
803 if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
806 delete [] script_counts;
// Scans every entry in use, then reports whether rtl_count ended up strictly
// greater than ltr_count.
816 for (
int id = 0;
id < size_used; ++id) {
823 return rtl_count > ltr_count;
// With no whitelist (NULL or empty) every entry defaults to enabled;
// otherwise every entry defaults to disabled.
830 const char* whitelist) {
831 bool def_enabled = whitelist ==
NULL || whitelist[0] ==
'\0';
833 for (
int ch = 0; ch < size_used; ++ch)
834 unichars[ch].properties.enabled = def_enabled;
// Walk the whitelist one unichar at a time (length given by step()) and
// enable each one that maps to a valid id.
838 for (
int w_ind = 0; whitelist[w_ind] !=
'\0'; w_ind += ch_step) {
839 ch_step =
step(whitelist + w_ind);
842 if (u_id != INVALID_UNICHAR_ID) {
843 unichars[u_id].properties.enabled =
true;
// The blacklist is applied afterwards, so it disables entries regardless of
// what the whitelist pass set.
850 if (blacklist !=
NULL && blacklist[0] !=
'\0') {
852 for (
int b_ind = 0; blacklist[b_ind] !=
'\0'; b_ind += ch_step) {
853 ch_step =
step(blacklist + b_ind);
856 if (u_id != INVALID_UNICHAR_ID) {
857 unichars[u_id].properties.enabled =
false;
// Linear search of the script table for an entry equal to script.
867 for (
int i = 0; i < script_table_size_used; ++i) {
868 if (strcmp(script, script_table[i]) == 0)
// First use: allocate room for 8 script names.
871 if (script_table_size_reserved == 0) {
872 script_table_size_reserved = 8;
873 script_table =
new char*[script_table_size_reserved];
// Double the table when adding one more entry would reach the reserved size;
// the existing pointers are moved over and the old table is released.
875 if (script_table_size_used + 1 >= script_table_size_reserved) {
876 char** new_script_table =
new char*[script_table_size_reserved * 2];
877 memcpy(new_script_table, script_table, script_table_size_reserved *
sizeof(
char*));
878 delete[] script_table;
879 script_table = new_script_table;
880 script_table_size_reserved = 2 * script_table_size_reserved;
// Store a private copy of the name and return its index; the post-increment
// makes the returned value the index of the entry just added.
882 script_table[script_table_size_used] =
new char[strlen(script) + 1];
883 strcpy(script_table[script_table_size_used], script);
884 return script_table_size_used++;
// A total of 1 is returned as the plain unichar with no fragment decoration.
891 if (total == 1)
return STRING(unichar);
893 result += kSeparator;
// Formats separator, pos, a second delimiter (kNaturalFlag for natural
// fragments, kSeparator otherwise) and total into buffer, limited to kMaxLen.
896 snprintf(buffer,
kMaxLen,
"%c%d%c%d", kSeparator, pos,
897 natural ? kNaturalFlag : kSeparator, total);
// Parsing starts at the beginning of string; the input must be at least
// kMinLen long and begin with kSeparator.
903 const char *ptr = string;
904 int len = strlen(
string);
905 if (len <
kMinLen || *ptr != kSeparator) {
// Extend over the unichar text until the next separator or the end of input.
910 while ((ptr + step) < (
string + len) && *(ptr + step) != kSeparator) {
// strncpy does not terminate when it copies exactly step bytes, so the
// terminator is written explicitly.
917 strncpy(unichar, ptr, step);
918 unichar[step] =
'\0';
922 bool natural =
false;
923 char *end_ptr =
NULL;
// Two numeric fields follow: i == 0 parses pos, i == 1 parses total.
924 for (
int i = 0; i < 2; i++) {
// Each field must be introduced by kSeparator; only the second may be
// introduced by kNaturalFlag instead. When ptr == string + len the `*ptr`
// read sees the terminating NUL (len comes from strlen), which fails the
// separator test.
925 if (ptr >
string + len || *ptr != kSeparator) {
926 if (i == 1 && *ptr == kNaturalFlag)
932 i == 0 ? pos =
static_cast<int>(strtol(ptr, &end_ptr, 10))
933 : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
// Checks whether parsing stopped anywhere other than exactly at the end of
// the input.
936 if (ptr !=
string + len) {
940 fragment->
set_all(unichar, pos, total, natural);
// Linear search of the script table for an entry equal to script_name.
945 for (
int i = 0; i < script_table_size_used; ++i) {
946 if (strcmp(script_name, script_table[i]) == 0)