Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
UNICHARSET Class Reference

#include <unicharset.h>

List of all members.

Classes

struct  UNICHAR_PROPERTIES
struct  UNICHAR_SLOT

Public Types

enum  Direction {
  U_LEFT_TO_RIGHT = 0, U_RIGHT_TO_LEFT = 1, U_EUROPEAN_NUMBER = 2, U_EUROPEAN_NUMBER_SEPARATOR = 3,
  U_EUROPEAN_NUMBER_TERMINATOR = 4, U_ARABIC_NUMBER = 5, U_COMMON_NUMBER_SEPARATOR = 6, U_BLOCK_SEPARATOR = 7,
  U_SEGMENT_SEPARATOR = 8, U_WHITE_SPACE_NEUTRAL = 9, U_OTHER_NEUTRAL = 10, U_LEFT_TO_RIGHT_EMBEDDING = 11,
  U_LEFT_TO_RIGHT_OVERRIDE = 12, U_RIGHT_TO_LEFT_ARABIC = 13, U_RIGHT_TO_LEFT_EMBEDDING = 14, U_RIGHT_TO_LEFT_OVERRIDE = 15,
  U_POP_DIRECTIONAL_FORMAT = 16, U_DIR_NON_SPACING_MARK = 17, U_BOUNDARY_NEUTRAL = 18, U_CHAR_DIRECTION_COUNT
}

Public Member Functions

 UNICHARSET ()
 ~UNICHARSET ()
const UNICHAR_ID unichar_to_id (const char *const unichar_repr) const
const UNICHAR_ID unichar_to_id (const char *const unichar_repr, int length) const
int step (const char *str) const
bool encodable_string (const char *str, int *first_bad_position) const
const char *const id_to_unichar (UNICHAR_ID id) const
const char *const id_to_unichar_ext (UNICHAR_ID id) const
STRING debug_str (UNICHAR_ID id) const
STRING debug_str (const char *unichar_repr) const
void unichar_insert (const char *const unichar_repr)
bool contains_unichar_id (UNICHAR_ID unichar_id) const
bool contains_unichar (const char *const unichar_repr) const
bool contains_unichar (const char *const unichar_repr, int length) const
bool eq (UNICHAR_ID unichar_id, const char *const unichar_repr) const
void delete_pointers_in_unichars ()
void clear ()
int size () const
void reserve (int unichars_number)
bool save_to_file (const char *const filename) const
bool save_to_file (FILE *file) const
bool load_from_inmemory_file (const char *const memory, int mem_size, bool skip_fragments)
bool load_from_inmemory_file (const char *const memory, int mem_size)
bool load_from_file (const char *const filename, bool skip_fragments)
bool load_from_file (const char *const filename)
bool load_from_file (FILE *file, bool skip_fragments)
bool load_from_file (FILE *file)
void post_load_setup ()
bool major_right_to_left () const
void set_black_and_whitelist (const char *blacklist, const char *whitelist)
void set_isalpha (UNICHAR_ID unichar_id, bool value)
void set_islower (UNICHAR_ID unichar_id, bool value)
void set_isupper (UNICHAR_ID unichar_id, bool value)
void set_isdigit (UNICHAR_ID unichar_id, bool value)
void set_ispunctuation (UNICHAR_ID unichar_id, bool value)
void set_isngram (UNICHAR_ID unichar_id, bool value)
void set_script (UNICHAR_ID unichar_id, const char *value)
void set_other_case (UNICHAR_ID unichar_id, UNICHAR_ID other_case)
void set_direction (UNICHAR_ID unichar_id, UNICHARSET::Direction value)
void set_mirror (UNICHAR_ID unichar_id, UNICHAR_ID mirror)
void set_normed (UNICHAR_ID unichar_id, const char *normed)
bool get_isalpha (UNICHAR_ID unichar_id) const
bool get_islower (UNICHAR_ID unichar_id) const
bool get_isupper (UNICHAR_ID unichar_id) const
bool get_isdigit (UNICHAR_ID unichar_id) const
bool get_ispunctuation (UNICHAR_ID unichar_id) const
bool get_isngram (UNICHAR_ID unichar_id) const
bool get_isprivate (UNICHAR_ID unichar_id) const
bool top_bottom_useful () const
void set_ranges_empty ()
void SetPropertiesFromOther (const UNICHARSET &src)
void ExpandRangesFromOther (const UNICHARSET &src)
void AppendOtherUnicharset (const UNICHARSET &src)
void get_top_bottom (UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
void set_top_bottom (UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
void get_width_range (UNICHAR_ID unichar_id, int *min_width, int *max_width) const
void set_width_range (UNICHAR_ID unichar_id, int min_width, int max_width)
void get_bearing_range (UNICHAR_ID unichar_id, int *min_bearing, int *max_bearing) const
void set_bearing_range (UNICHAR_ID unichar_id, int min_bearing, int max_bearing)
void get_advance_range (UNICHAR_ID unichar_id, int *min_advance, int *max_advance) const
void set_advance_range (UNICHAR_ID unichar_id, int min_advance, int max_advance)
int get_script (UNICHAR_ID unichar_id) const
unsigned int get_properties (UNICHAR_ID unichar_id) const
char get_chartype (UNICHAR_ID unichar_id) const
UNICHAR_ID get_other_case (UNICHAR_ID unichar_id) const
Direction get_direction (UNICHAR_ID unichar_id) const
UNICHAR_ID get_mirror (UNICHAR_ID unichar_id) const
UNICHAR_ID to_lower (UNICHAR_ID unichar_id) const
UNICHAR_ID to_upper (UNICHAR_ID unichar_id) const
const CHAR_FRAGMENT * get_fragment (UNICHAR_ID unichar_id) const
bool get_isalpha (const char *const unichar_repr) const
bool get_islower (const char *const unichar_repr) const
bool get_isupper (const char *const unichar_repr) const
bool get_isdigit (const char *const unichar_repr) const
bool get_ispunctuation (const char *const unichar_repr) const
unsigned int get_properties (const char *const unichar_repr) const
char get_chartype (const char *const unichar_repr) const
int get_script (const char *const unichar_repr) const
const CHAR_FRAGMENT * get_fragment (const char *const unichar_repr) const
bool get_isalpha (const char *const unichar_repr, int length) const
bool get_islower (const char *const unichar_repr, int length) const
bool get_isupper (const char *const unichar_repr, int length) const
bool get_isdigit (const char *const unichar_repr, int length) const
bool get_ispunctuation (const char *const unichar_repr, int length) const
const char * get_normed_unichar (UNICHAR_ID unichar_id) const
int get_script (const char *const unichar_repr, int length) const
int get_script_table_size () const
const char * get_script_from_script_id (int id) const
int get_script_id_from_name (const char *script_name) const
bool is_null_script (const char *script) const
int add_script (const char *script)
bool get_enabled (UNICHAR_ID unichar_id) const
int null_sid () const
int common_sid () const
int latin_sid () const
int cyrillic_sid () const
int greek_sid () const
int han_sid () const
int hiragana_sid () const
int katakana_sid () const
int default_sid () const
bool script_has_upper_lower () const
bool script_has_xheight () const

Static Public Member Functions

static STRING debug_utf8_str (const char *str)

Static Public Attributes

static const char * kCustomLigatures [][2]

Detailed Description

Definition at line 127 of file unicharset.h.


Member Enumeration Documentation

Enumerator:
U_LEFT_TO_RIGHT 
U_RIGHT_TO_LEFT 
U_EUROPEAN_NUMBER 
U_EUROPEAN_NUMBER_SEPARATOR 
U_EUROPEAN_NUMBER_TERMINATOR 
U_ARABIC_NUMBER 
U_COMMON_NUMBER_SEPARATOR 
U_BLOCK_SEPARATOR 
U_SEGMENT_SEPARATOR 
U_WHITE_SPACE_NEUTRAL 
U_OTHER_NEUTRAL 
U_LEFT_TO_RIGHT_EMBEDDING 
U_LEFT_TO_RIGHT_OVERRIDE 
U_RIGHT_TO_LEFT_ARABIC 
U_RIGHT_TO_LEFT_EMBEDDING 
U_RIGHT_TO_LEFT_OVERRIDE 
U_POP_DIRECTIONAL_FORMAT 
U_DIR_NON_SPACING_MARK 
U_BOUNDARY_NEUTRAL 
U_CHAR_DIRECTION_COUNT 

Definition at line 135 of file unicharset.h.


Constructor & Destructor Documentation

UNICHARSET::UNICHARSET ( )

Definition at line 146 of file unicharset.cpp.

:
// Start with no unichar storage and no script table allocated.
unichars(NULL),
ids(),
size_used(0),
size_reserved(0),
script_table(NULL),
script_table_size_used(0),
// Name reported for the null script.
null_script("NULL") {
// clear() sets the members that are not in the initializer list above
// (script_table_size_reserved, the flags and the cached script ids).
clear();
}
UNICHARSET::~UNICHARSET ( )

Definition at line 157 of file unicharset.cpp.

{
  // Tear down by resetting to the empty state; clear() frees the script
  // table and the unichar array.
  this->clear();
}

Member Function Documentation

int UNICHARSET::add_script ( const char *  script)

Definition at line 866 of file unicharset.cpp.

{
  // Reuse the existing index when the script name is already registered.
  int index = 0;
  while (index < script_table_size_used) {
    if (strcmp(script, script_table[index]) == 0)
      return index;
    ++index;
  }
  // First use: allocate the initial table of 8 entries.
  if (script_table_size_reserved == 0) {
    script_table_size_reserved = 8;
    script_table = new char*[script_table_size_reserved];
  }
  // Double the capacity when the table is about to fill up.
  if (script_table_size_used + 1 >= script_table_size_reserved) {
    const int grown_size = script_table_size_reserved * 2;
    char** grown_table = new char*[grown_size];
    memcpy(grown_table, script_table,
           script_table_size_reserved * sizeof(char*));
    delete[] script_table;
    script_table = grown_table;
    script_table_size_reserved = grown_size;
  }
  // Store a private copy of the name and hand back its index.
  const int new_id = script_table_size_used;
  script_table[new_id] = new char[strlen(script) + 1];
  strcpy(script_table[new_id], script);
  ++script_table_size_used;
  return new_id;
}
void UNICHARSET::AppendOtherUnicharset ( const UNICHARSET &  src)

Definition at line 375 of file unicharset.cpp.

{
  // Merges the characters of src into this set. Characters already present
  // have their ranges expanded; new characters are inserted and receive a
  // copy of src's properties, with script_id, other_case and mirror remapped
  // to ids that are valid in this set.
  for (int ch = 0; ch < src.size_used; ++ch) {
    const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
    const char* utf8 = src.id_to_unichar(ch);
    if (strcmp(utf8, " ") != 0 && src_props.AnyRangeEmpty()) {
      // Only use fully valid entries.
      tprintf("Bad properties for char %s: %d,%d %d,%d %d,%d %d,%d %d,%d\n",
              utf8, src_props.min_bottom, src_props.max_bottom,
              src_props.min_top, src_props.max_top,
              src_props.min_width, src_props.max_width,
              src_props.min_bearing, src_props.max_bearing,
              src_props.min_advance, src_props.max_advance);
      continue;
    }
    // A new character takes the next free id (the current size_used).
    int id = size_used;
    if (contains_unichar(utf8)) {
      id = unichar_to_id(utf8);
    } else {
      // Insert the character first so that slot `id` actually exists, then
      // mark its ranges empty so the properties are copied from src below.
      unichar_insert(utf8);
      unichars[id].properties.SetRangesEmpty();
    }
    if (!unichars[id].properties.AnyRangeEmpty()) {
      // Just expand current ranges.
      unichars[id].properties.ExpandRangesFrom(src_props);
    } else {
      // Copy properties from src_props.
      unichars[id].properties.CopyFrom(src_props);
      // Setup the script_id, other_case and mirror properly: the values
      // copied from src are ids in src, so translate them via their strings.
      const char* script = src.get_script_from_script_id(src_props.script_id);
      unichars[id].properties.script_id = add_script(script);
      const char* other_case = src.id_to_unichar(src_props.other_case);
      if (!contains_unichar(other_case)) {
        unichar_insert(other_case);
        unichars[size_used - 1].properties.SetRangesEmpty();
        // Other_case will have its ranges set later as it is contained in src.
      }
      unichars[id].properties.other_case = unichar_to_id(other_case);
      const char* mirror_str = src.id_to_unichar(src_props.mirror);
      if (!contains_unichar(mirror_str)) {
        unichar_insert(mirror_str);
        unichars[size_used - 1].properties.SetRangesEmpty();
        // Mirror will have its ranges set later as it is contained in src.
      }
      unichars[id].properties.mirror = unichar_to_id(mirror_str);
    }
  }
}
void UNICHARSET::clear ( )
inline

Definition at line 233 of file unicharset.h.

{
  // Resets the set to the empty state, releasing the script table, the
  // unichar array and the fragments referenced from it.
  if (script_table != NULL) {
    for (int i = 0; i < script_table_size_used; ++i)
      delete[] script_table[i];
    delete[] script_table;
    script_table = NULL;
    script_table_size_used = 0;
  }
  if (unichars != NULL) {
    // Fragments are held by raw pointer (reserve() copies slots and then
    // deletes the old array), so free them explicitly before the array goes.
    delete_pointers_in_unichars();
    delete[] unichars;
    unichars = NULL;
  }
  script_table_size_reserved = 0;
  size_reserved = 0;
  size_used = 0;
  ids.clear();
  top_bottom_set_ = false;
  script_has_upper_lower_ = false;
  script_has_xheight_ = false;
  null_sid_ = 0;
  common_sid_ = 0;
  latin_sid_ = 0;
  cyrillic_sid_ = 0;
  greek_sid_ = 0;
  han_sid_ = 0;
  hiragana_sid_ = 0;
  katakana_sid_ = 0;
  // Reset with the other cached ids so default_sid() never returns an
  // uninitialized or stale value before post_load_setup() runs.
  default_sid_ = 0;
}
int UNICHARSET::common_sid ( ) const
inline

Definition at line 753 of file unicharset.h.

{
  // Cached script id looked up for "Common".
  return this->common_sid_;
}
bool UNICHARSET::contains_unichar ( const char *const  unichar_repr) const

Definition at line 543 of file unicharset.cpp.

{
  // True when the id map holds this representation.
  const bool known = ids.contains(unichar_repr);
  return known;
}
bool UNICHARSET::contains_unichar ( const char *const  unichar_repr,
int  length 
) const

Definition at line 547 of file unicharset.cpp.

{
  // A zero-length representation is never contained; otherwise ask the map.
  return length != 0 && ids.contains(unichar_repr, length);
}
bool UNICHARSET::contains_unichar_id ( UNICHAR_ID  unichar_id) const
inline

Definition at line 209 of file unicharset.h.

{
  // Valid ids are not the invalid marker and lie in [0, size_used).
  if (unichar_id == INVALID_UNICHAR_ID) return false;
  if (!(unichar_id < size_used)) return false;
  return unichar_id >= 0;
}
int UNICHARSET::cyrillic_sid ( ) const
inline

Definition at line 755 of file unicharset.h.

{
  // Cached script id looked up for "Cyrillic".
  return this->cyrillic_sid_;
}
STRING UNICHARSET::debug_str ( UNICHAR_ID  id) const

Definition at line 285 of file unicharset.cpp.

{
  if (id == INVALID_UNICHAR_ID)
    return STRING(id_to_unichar(id));
  // Fragments provide their own printable form.
  const CHAR_FRAGMENT* frag = this->get_fragment(id);
  if (frag) {
    return frag->to_string();
  }
  STRING text = debug_utf8_str(id_to_unichar(id));
  // Alpha suffix: a for lower case, A for upper case, x if neither.
  if (get_isalpha(id)) {
    const char* alpha_tag = "x";
    if (get_islower(id))
      alpha_tag = "a";
    else if (get_isupper(id))
      alpha_tag = "A";
    text += alpha_tag;
  }
  // Digit suffix.
  if (get_isdigit(id))
    text += "0";
  // Punctuation suffix.
  if (get_ispunctuation(id))
    text += "p";
  return text;
}
STRING UNICHARSET::debug_str ( const char *  unichar_repr) const
inline

Definition at line 200 of file unicharset.h.

{
  // Resolve the representation to an id and use the id-based overload.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return debug_str(id);
}
STRING UNICHARSET::debug_utf8_str ( const char *  str)
static

Definition at line 261 of file unicharset.cpp.

{
  // Returns str followed by " [" and the hex value of each unicode in it,
  // space separated, then "]".
  STRING result = str;
  result += " [";
  int step = 1;
  // Chop into unicodes and code each as hex.
  for (int i = 0; str[i] != '\0'; i += step) {
    char hex[sizeof(int) * 2 + 1];
    step = UNICHAR::utf8_step(str + i);
    if (step == 0) {
      // Not a valid UTF-8 start: dump the single raw byte. Go through
      // unsigned char so a byte >= 0x80 prints as two hex digits instead of
      // being sign-extended to ffffffXX where char is signed.
      step = 1;
      sprintf(hex, "%x",
              static_cast<unsigned int>(static_cast<unsigned char>(str[i])));
    } else {
      UNICHAR ch(str + i, step);
      sprintf(hex, "%x", ch.first_uni());
    }
    result += hex;
    result += " ";
  }
  result += "]";
  return result;
}
int UNICHARSET::default_sid ( ) const
inline

Definition at line 760 of file unicharset.h.

{
  // Script id chosen as the default by post_load_setup().
  return this->default_sid_;
}
void UNICHARSET::delete_pointers_in_unichars ( )
inline

Definition at line 223 of file unicharset.h.

{
  // Free every fragment referenced from a used slot and null the pointer.
  for (int slot = 0; slot < size_used; ++slot) {
    UNICHAR_PROPERTIES& props = unichars[slot].properties;
    if (props.fragment == NULL)
      continue;
    delete props.fragment;
    props.fragment = NULL;
  }
}
bool UNICHARSET::encodable_string ( const char *  str,
int *  first_bad_position 
) const

Definition at line 220 of file unicharset.cpp.

{
  // Walk the string one step() at a time; a step of 0 marks the first
  // position that cannot be encoded.
  const int len = strlen(str);
  int pos = 0;
  while (pos < len) {
    const int increment = step(str + pos);
    if (increment == 0) {
      if (first_bad_position) *first_bad_position = pos;
      return false;
    }
    pos += increment;
  }
  return true;
}
bool UNICHARSET::eq ( UNICHAR_ID  unichar_id,
const char *const  unichar_repr 
) const

Definition at line 555 of file unicharset.cpp.

{
  // Compare the stored representation of unichar_id with unichar_repr.
  const char* stored = this->id_to_unichar(unichar_id);
  return strcmp(stored, unichar_repr) == 0;
}
void UNICHARSET::ExpandRangesFromOther ( const UNICHARSET &  src)

Definition at line 361 of file unicharset.cpp.

{
  // For each unichar in this set, widen its ranges with the properties src
  // reports for the same string (when src.GetStrProperties succeeds).
  int ch = 0;
  while (ch < size_used) {
    const char* repr = id_to_unichar(ch);
    UNICHAR_PROPERTIES src_props;
    if (src.GetStrProperties(repr, &src_props))
      unichars[ch].properties.ExpandRangesFrom(src_props);
    ++ch;
  }
}
void UNICHARSET::get_advance_range ( UNICHAR_ID  unichar_id,
int *  min_advance,
int *  max_advance 
) const
inline

Definition at line 531 of file unicharset.h.

{
  // The invalid id reports an advance range of 0..0.
  if (unichar_id != INVALID_UNICHAR_ID) {
    const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
    *min_advance = props.min_advance;
    *max_advance = props.max_advance;
  } else {
    *min_advance = *max_advance = 0;
  }
}
void UNICHARSET::get_bearing_range ( UNICHAR_ID  unichar_id,
int *  min_bearing,
int *  max_bearing 
) const
inline

Definition at line 510 of file unicharset.h.

{
  // The invalid id reports a bearing range of 0..0.
  if (unichar_id != INVALID_UNICHAR_ID) {
    const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
    *min_bearing = props.min_bearing;
    *max_bearing = props.max_bearing;
  } else {
    *min_bearing = *max_bearing = 0;
  }
}
char UNICHARSET::get_chartype ( UNICHAR_ID  unichar_id) const

Definition at line 502 of file unicharset.cpp.

{
  // Returns a one-character class code for unichar_id: 'A' upper case,
  // 'a' lower case, 'x' other alpha, '0' digit, 'p' punctuation, or 0 when
  // none of these apply. Tests run in that order, so case wins over alpha.
  if (this->get_isupper(unichar_id)) return 'A';
  if (this->get_islower(unichar_id)) return 'a';
  if (this->get_isalpha(unichar_id)) return 'x';
  if (this->get_isdigit(unichar_id)) return '0';
  if (this->get_ispunctuation(unichar_id)) return 'p';
  return 0;
}
char UNICHARSET::get_chartype ( const char *const  unichar_repr) const
inline

Definition at line 647 of file unicharset.h.

{
  // Resolve the representation to an id and use the id-based overload.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_chartype(id);
}
Direction UNICHARSET::get_direction ( UNICHAR_ID  unichar_id) const
inline

Definition at line 579 of file unicharset.h.

{
  // The invalid id is reported as U_OTHER_NEUTRAL.
  if (unichar_id != INVALID_UNICHAR_ID)
    return unichars[unichar_id].properties.direction;
  return UNICHARSET::U_OTHER_NEUTRAL;
}
bool UNICHARSET::get_enabled ( UNICHAR_ID  unichar_id) const
inline

Definition at line 747 of file unicharset.h.

{
  // Reads the enabled flag directly; the id is not validated here.
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  return props.enabled;
}
const CHAR_FRAGMENT* UNICHARSET::get_fragment ( UNICHAR_ID  unichar_id) const
inline

Definition at line 610 of file unicharset.h.

{
  // The invalid id has no fragment.
  if (unichar_id != INVALID_UNICHAR_ID)
    return unichars[unichar_id].properties.fragment;
  return NULL;
}
const CHAR_FRAGMENT* UNICHARSET::get_fragment ( const char *const  unichar_repr) const
inline

Definition at line 660 of file unicharset.h.

{
  // NULL, empty or unknown representations have no fragment.
  if (unichar_repr == NULL) return NULL;
  if (unichar_repr[0] == '\0') return NULL;
  if (!ids.contains(unichar_repr)) return NULL;
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_fragment(id);
}
bool UNICHARSET::get_isalpha ( UNICHAR_ID  unichar_id) const
inline

Definition at line 392 of file unicharset.h.

{
  // The invalid id is never alpha.
  return unichar_id != INVALID_UNICHAR_ID &&
         unichars[unichar_id].properties.isalpha;
}
bool UNICHARSET::get_isalpha ( const char *const  unichar_repr) const
inline

Definition at line 617 of file unicharset.h.

{
  // Resolve the representation to an id and use the id-based overload.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_isalpha(id);
}
bool UNICHARSET::get_isalpha ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 670 of file unicharset.h.

{
  // Resolve the first `length` bytes to an id, then use the id overload.
  const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
  return get_isalpha(id);
}
bool UNICHARSET::get_isdigit ( UNICHAR_ID  unichar_id) const
inline

Definition at line 413 of file unicharset.h.

{
  // The invalid id is never a digit.
  return unichar_id != INVALID_UNICHAR_ID &&
         unichars[unichar_id].properties.isdigit;
}
bool UNICHARSET::get_isdigit ( const char *const  unichar_repr) const
inline

Definition at line 632 of file unicharset.h.

{
  // Resolve the representation to an id and use the id-based overload.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_isdigit(id);
}
bool UNICHARSET::get_isdigit ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 691 of file unicharset.h.

{
  // Resolve the first `length` bytes to an id, then use the id overload.
  const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
  return get_isdigit(id);
}
bool UNICHARSET::get_islower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 399 of file unicharset.h.

{
  // The invalid id is never lower case.
  return unichar_id != INVALID_UNICHAR_ID &&
         unichars[unichar_id].properties.islower;
}
bool UNICHARSET::get_islower ( const char *const  unichar_repr) const
inline

Definition at line 622 of file unicharset.h.

{
  // Resolve the representation to an id and use the id-based overload.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_islower(id);
}
bool UNICHARSET::get_islower ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 677 of file unicharset.h.

{
  // Resolve the first `length` bytes to an id, then use the id overload.
  const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
  return get_islower(id);
}
bool UNICHARSET::get_isngram ( UNICHAR_ID  unichar_id) const
inline

Definition at line 427 of file unicharset.h.

{
  // The invalid id is never an ngram.
  return unichar_id != INVALID_UNICHAR_ID &&
         unichars[unichar_id].properties.isngram;
}
bool UNICHARSET::get_isprivate ( UNICHAR_ID  unichar_id) const

Definition at line 316 of file unicharset.cpp.

{
  // Take the first code point of this unichar's string and test whether it
  // lies in 0xE000..0xF8FF.
  UNICHAR decoded(id_to_unichar(unichar_id), -1);
  const int code_point = decoded.first_uni();
  if (code_point < 0xE000) return false;
  return code_point <= 0xF8FF;
}
bool UNICHARSET::get_ispunctuation ( UNICHAR_ID  unichar_id) const
inline

Definition at line 420 of file unicharset.h.

{
  // The invalid id is never punctuation.
  return unichar_id != INVALID_UNICHAR_ID &&
         unichars[unichar_id].properties.ispunctuation;
}
bool UNICHARSET::get_ispunctuation ( const char *const  unichar_repr) const
inline

Definition at line 637 of file unicharset.h.

{
  // Resolve the representation to an id and use the id-based overload.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_ispunctuation(id);
}
bool UNICHARSET::get_ispunctuation ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 698 of file unicharset.h.

{
  // Resolve the first `length` bytes to an id, then use the id overload.
  const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
  return get_ispunctuation(id);
}
bool UNICHARSET::get_isupper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 406 of file unicharset.h.

{
  // The invalid id is never upper case.
  return unichar_id != INVALID_UNICHAR_ID &&
         unichars[unichar_id].properties.isupper;
}
bool UNICHARSET::get_isupper ( const char *const  unichar_repr) const
inline

Definition at line 627 of file unicharset.h.

{
  // Resolve the representation to an id and use the id-based overload.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_isupper(id);
}
bool UNICHARSET::get_isupper ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 684 of file unicharset.h.

{
  // Resolve the first `length` bytes to an id, then use the id overload.
  const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
  return get_isupper(id);
}
UNICHAR_ID UNICHARSET::get_mirror ( UNICHAR_ID  unichar_id) const
inline

Definition at line 586 of file unicharset.h.

{
  // The invalid id mirrors to the invalid id.
  if (unichar_id != INVALID_UNICHAR_ID)
    return unichars[unichar_id].properties.mirror;
  return INVALID_UNICHAR_ID;
}
const char* UNICHARSET::get_normed_unichar ( UNICHAR_ID  unichar_id) const
inline

Definition at line 704 of file unicharset.h.

{
  // Returns the C string held in the slot's `normed` property; the id is
  // not validated here.
  UNICHAR_SLOT& slot = unichars[unichar_id];
  return slot.properties.normed.string();
}
UNICHAR_ID UNICHARSET::get_other_case ( UNICHAR_ID  unichar_id) const
inline

Definition at line 572 of file unicharset.h.

{
  // The invalid id has the invalid id as its other case.
  if (unichar_id != INVALID_UNICHAR_ID)
    return unichars[unichar_id].properties.other_case;
  return INVALID_UNICHAR_ID;
}
unsigned int UNICHARSET::get_properties ( UNICHAR_ID  unichar_id) const

Definition at line 487 of file unicharset.cpp.

{
  // Builds a bitmask of the character-class flags set for unichar_id.
  // Each getter returns false for INVALID_UNICHAR_ID, so that id yields 0.
  unsigned int properties = 0;
  if (this->get_isalpha(unichar_id))
    properties |= ISALPHA_MASK;
  if (this->get_islower(unichar_id))
    properties |= ISLOWER_MASK;
  if (this->get_isupper(unichar_id))
    properties |= ISUPPER_MASK;
  if (this->get_isdigit(unichar_id))
    properties |= ISDIGIT_MASK;
  if (this->get_ispunctuation(unichar_id))
    properties |= ISPUNCTUATION_MASK;
  return properties;
}
unsigned int UNICHARSET::get_properties ( const char *const  unichar_repr) const
inline

Definition at line 643 of file unicharset.h.

{
  // Resolve the representation to an id and use the id-based overload.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_properties(id);
}
int UNICHARSET::get_script ( UNICHAR_ID  unichar_id) const
inline

Definition at line 552 of file unicharset.h.

{
  // The invalid id is reported with the null script id.
  if (unichar_id != INVALID_UNICHAR_ID)
    return unichars[unichar_id].properties.script_id;
  return null_sid_;
}
int UNICHARSET::get_script ( const char *const  unichar_repr) const
inline

Definition at line 654 of file unicharset.h.

{
  // Resolve the representation to an id and use the id-based overload.
  const UNICHAR_ID id = unichar_to_id(unichar_repr);
  return get_script(id);
}
int UNICHARSET::get_script ( const char *const  unichar_repr,
int  length 
) const
inline

Definition at line 712 of file unicharset.h.

{
  // Resolve the first `length` bytes to an id, then use the id overload.
  const UNICHAR_ID id = unichar_to_id(unichar_repr, length);
  return get_script(id);
}
const char* UNICHARSET::get_script_from_script_id ( int  id) const
inline

Definition at line 723 of file unicharset.h.

{
  // Ids outside [0, script_table_size_used) map to the null script name.
  const bool in_range = id >= 0 && id < script_table_size_used;
  if (in_range)
    return script_table[id];
  return null_script;
}
int UNICHARSET::get_script_id_from_name ( const char *  script_name) const

Definition at line 944 of file unicharset.cpp.

{
  // Linear search of the script table by name.
  int index = 0;
  while (index < script_table_size_used) {
    if (strcmp(script_name, script_table[index]) == 0)
      return index;
    ++index;
  }
  // Unknown names fall back to 0, which is always the null_script.
  return 0;
}
int UNICHARSET::get_script_table_size ( ) const
inline

Definition at line 718 of file unicharset.h.

{
  // Number of script table entries currently in use.
  return this->script_table_size_used;
}
void UNICHARSET::get_top_bottom ( UNICHAR_ID  unichar_id,
int *  min_bottom,
int *  max_bottom,
int *  min_top,
int *  max_top 
) const
inline

Definition at line 459 of file unicharset.h.

{
  // The invalid id reports the widest range: 0..256 for both top and bottom.
  if (unichar_id != INVALID_UNICHAR_ID) {
    const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
    *min_bottom = props.min_bottom;
    *max_bottom = props.max_bottom;
    *min_top = props.min_top;
    *max_top = props.max_top;
  } else {
    *min_bottom = *min_top = 0;
    *max_bottom = *max_top = 256;  // kBlnCellHeight
  }
}
void UNICHARSET::get_width_range ( UNICHAR_ID  unichar_id,
int *  min_width,
int *  max_width 
) const
inline

Definition at line 489 of file unicharset.h.

{
  // The invalid id reports the widest range: 0..256.
  if (unichar_id != INVALID_UNICHAR_ID) {
    const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
    *min_width = props.min_width;
    *max_width = props.max_width;
  } else {
    *min_width = 0;
    *max_width = 256;  // kBlnCellHeight;
  }
}
int UNICHARSET::greek_sid ( ) const
inline

Definition at line 756 of file unicharset.h.

{
  // Cached script id looked up for "Greek".
  return this->greek_sid_;
}
int UNICHARSET::han_sid ( ) const
inline

Definition at line 757 of file unicharset.h.

{
  // Cached script id looked up for "Han".
  return this->han_sid_;
}
int UNICHARSET::hiragana_sid ( ) const
inline

Definition at line 758 of file unicharset.h.

{
  // Cached script id looked up for "Hiragana".
  return this->hiragana_sid_;
}
const char *const UNICHARSET::id_to_unichar ( UNICHAR_ID  id) const

Definition at line 233 of file unicharset.cpp.

{
  // Valid ids return the stored representation; the invalid id maps to the
  // INVALID_UNICHAR string.
  if (id != INVALID_UNICHAR_ID) {
    ASSERT_HOST(id < this->size());
    return unichars[id].representation;
  }
  return INVALID_UNICHAR;
}
const char *const UNICHARSET::id_to_unichar_ext ( UNICHAR_ID  id) const

Definition at line 241 of file unicharset.cpp.

{
  if (id == INVALID_UNICHAR_ID)
    return INVALID_UNICHAR;
  ASSERT_HOST(id < this->size());
  // A private encoding is translated through kCustomLigatures: column 1
  // holds the private form, column 0 the string to return.
  if (get_isprivate(id)) {
    const char* stored = id_to_unichar(id);
    int row = 0;
    while (kCustomLigatures[row][0] != NULL) {
      if (strcmp(stored, kCustomLigatures[row][1]) == 0)
        return kCustomLigatures[row][0];
      ++row;
    }
  }
  // No table entry applies: return the stored representation.
  return unichars[id].representation;
}
bool UNICHARSET::is_null_script ( const char *  script) const
inline

Definition at line 737 of file unicharset.h.

{
  // Pointer comparison against the null_script member (not a string compare).
  const bool same_pointer = (script == null_script);
  return same_pointer;
}
int UNICHARSET::katakana_sid ( ) const
inline

Definition at line 759 of file unicharset.h.

{
  // Cached script id looked up for "Katakana".
  return this->katakana_sid_;
}
int UNICHARSET::latin_sid ( ) const
inline

Definition at line 754 of file unicharset.h.

{
  // Cached script id looked up for "Latin".
  return this->latin_sid_;
}
bool UNICHARSET::load_from_file ( const char *const  filename,
bool  skip_fragments 
)
inline

Definition at line 298 of file unicharset.h.

{
  // Open the named file for binary reading, load from it, then close it.
  FILE* fp = fopen(filename, "rb");
  if (fp == NULL)
    return false;
  const bool loaded = load_from_file(fp, skip_fragments);
  fclose(fp);
  return loaded;
}
bool UNICHARSET::load_from_file ( const char *const  filename)
inline

Definition at line 306 of file unicharset.h.

{
  // Convenience overload: load without skipping fragments.
  const bool skip_fragments = false;
  return load_from_file(filename, skip_fragments);
}
bool UNICHARSET::load_from_file ( FILE *  file,
bool  skip_fragments 
)

Definition at line 638 of file unicharset.cpp.

{
  // Loads the set from an already open FILE. The file is not closed here.
  // Returns the result of load_via_fgets.
  LocalFilePointer lfp(file);
  // Wrap the file's fgets in a callback for load_via_fgets. The callback is
  // permanent, so it must be deleted explicitly once loading is done.
  TessResultCallback2<char *, char *, int> *fgets_cb =
      NewPermanentTessCallback(&lfp, &LocalFilePointer::fgets);
  bool success = load_via_fgets(fgets_cb, skip_fragments);
  delete fgets_cb;
  return success;
}
bool UNICHARSET::load_from_file ( FILE *  file)
inline

Definition at line 313 of file unicharset.h.

{
  // Convenience overload: load without skipping fragments.
  const bool skip_fragments = false;
  return load_from_file(file, skip_fragments);
}
bool UNICHARSET::load_from_inmemory_file ( const char *const  memory,
int  mem_size,
bool  skip_fragments 
)

Definition at line 618 of file unicharset.cpp.

{
  // Loads the set from a buffer of mem_size bytes holding the file contents.
  // Returns the result of load_via_fgets.
  InMemoryFilePointer mem_fp(memory, mem_size);
  // Wrap the in-memory fgets in a callback for load_via_fgets. The callback
  // is permanent, so it must be deleted explicitly once loading is done.
  TessResultCallback2<char *, char *, int> *fgets_cb =
      NewPermanentTessCallback(&mem_fp, &InMemoryFilePointer::fgets);
  bool success = load_via_fgets(fgets_cb, skip_fragments);
  delete fgets_cb;
  return success;
}
bool UNICHARSET::load_from_inmemory_file ( const char *const  memory,
int  mem_size 
)
inline

Definition at line 291 of file unicharset.h.

{
  // Convenience overload: load without skipping fragments.
  const bool skip_fragments = false;
  return load_from_inmemory_file(memory, mem_size, skip_fragments);
}
bool UNICHARSET::major_right_to_left ( ) const

Definition at line 813 of file unicharset.cpp.

{
  // Returns true when the set holds more right-to-left unichars than
  // left-to-right ones, judged by each unichar's stored direction.
  int ltr_count = 0;
  int rtl_count = 0;
  for (int id = 0; id < size_used; ++id) {
    int dir = get_direction(id);
    if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
    // Right-to-left, right-to-left Arabic and Arabic numbers all count
    // towards the right-to-left total.
    if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
        dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC ||
        dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
  }
  return rtl_count > ltr_count;
}
int UNICHARSET::null_sid ( ) const
inline

Definition at line 752 of file unicharset.h.

{
  // Cached script id of the null script.
  return this->null_sid_;
}
void UNICHARSET::post_load_setup ( )

Definition at line 750 of file unicharset.cpp.

{
  // Balance of cased vs. uncased alphas: +1 for each alpha that is upper or
  // lower case, -1 for each that is neither. A positive total means more
  // than half of the alphas have case.
  int case_balance = 0;
  int xheight_count = 0;
  int capheight_count = 0;
  top_bottom_set_ = false;
  for (UNICHAR_ID id = 0; id < size_used; ++id) {
    int min_bottom = 0;
    int max_bottom = MAX_UINT8;
    int min_top = 0;
    int max_top = MAX_UINT8;
    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
    // Any non-zero min_top shows that top/bottom data was supplied.
    if (min_top > 0)
      top_bottom_set_ = true;
    if (!get_isalpha(id))
      continue;
    if (get_islower(id) || get_isupper(id))
      ++case_balance;
    else
      --case_balance;
    // Classify the alpha by which side of the mean line its top range is on.
    if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
      ++xheight_count;
    else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
      ++capheight_count;
  }

  script_has_upper_lower_ = case_balance > 0;
  script_has_xheight_ = script_has_upper_lower_ ||
      (xheight_count > capheight_count * kMinXHeightFraction &&
       capheight_count > xheight_count * kMinCapHeightFraction);

  // Cache the ids of the scripts that have dedicated accessors.
  null_sid_ = get_script_id_from_name(null_script);
  ASSERT_HOST(null_sid_ == 0);
  common_sid_ = get_script_id_from_name("Common");
  latin_sid_ = get_script_id_from_name("Latin");
  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
  greek_sid_ = get_script_id_from_name("Greek");
  han_sid_ = get_script_id_from_name("Han");
  hiragana_sid_ = get_script_id_from_name("Hiragana");
  katakana_sid_ = get_script_id_from_name("Katakana");

  // Default script: the script with the most alphas, excluding the common
  // script, as that still contains some "alphas".
  int* alphas_per_script = new int[script_table_size_used];
  memset(alphas_per_script, 0,
         sizeof(*alphas_per_script) * script_table_size_used);
  for (int id = 0; id < size_used; ++id) {
    if (get_isalpha(id))
      ++alphas_per_script[get_script(id)];
  }
  default_sid_ = 0;
  for (int sid = 1; sid < script_table_size_used; ++sid) {
    if (alphas_per_script[sid] > alphas_per_script[default_sid_] &&
        sid != common_sid_)
      default_sid_ = sid;
  }
  delete [] alphas_per_script;
}
void UNICHARSET::reserve ( int  unichars_number)

Definition at line 161 of file unicharset.cpp.

{
  // Nothing to do unless more slots are requested than already reserved.
  if (unichars_number <= size_reserved)
    return;
  UNICHAR_SLOT* grown = new UNICHAR_SLOT[unichars_number];
  // Carry over the slots that are in use.
  for (int used = 0; used < size_used; ++used)
    grown[used] = unichars[used];
  // Every fresh slot starts with the null script.
  for (int fresh = size_used; fresh < unichars_number; ++fresh)
    grown[fresh].properties.script_id = add_script(null_script);
  delete[] unichars;
  unichars = grown;
  size_reserved = unichars_number;
}
bool UNICHARSET::save_to_file ( const char *const  filename) const
inline

Definition at line 273 of file unicharset.h.

{
  // Open the named file with mode "w+b", save into it, then close it.
  FILE* fp = fopen(filename, "w+b");
  if (fp == NULL)
    return false;
  const bool saved = save_to_file(fp);
  fclose(fp);
  return saved;
}
bool UNICHARSET::save_to_file ( FILE *  file) const

Definition at line 560 of file unicharset.cpp.

{
  // Writes the set as text: first the number of unichars, then one line per
  // unichar. Always returns true; fprintf results are not checked.
  fprintf(file, "%d\n", this->size());
  for (UNICHAR_ID id = 0; id < this->size(); ++id) {
    int min_bottom, max_bottom, min_top, max_top;
    get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
    int min_width, max_width;
    get_width_range(id, &min_width, &max_width);
    int min_bearing, max_bearing;
    get_bearing_range(id, &min_bearing, &max_bearing);
    int min_advance, max_advance;
    get_advance_range(id, &min_advance, &max_advance);
    unsigned int properties = this->get_properties(id);
    if (strcmp(this->id_to_unichar(id), " ") == 0) {
      // The space character is written as the literal NULL, in a short form:
      // name, properties, script name, other case.
      fprintf(file, "%s %x %s %d\n", "NULL", properties,
              this->get_script_from_script_id(this->get_script(id)),
              this->get_other_case(id));
    } else {
      // Full form: name, properties, the ten range values, script name,
      // other case, direction, mirror, normed string, then a debug comment.
      fprintf(file,
              "%s %x %d,%d,%d,%d,%d,%d,%d,%d,%d,%d %s %d %d %d %s\t# %s\n",
              this->id_to_unichar(id), properties,
              min_bottom, max_bottom, min_top, max_top, min_width, max_width,
              min_bearing, max_bearing, min_advance, max_advance,
              this->get_script_from_script_id(this->get_script(id)),
              this->get_other_case(id), this->get_direction(id),
              this->get_mirror(id), this->get_normed_unichar(id),
              this->debug_str(id).string());
    }
  }
  return true;
}
bool UNICHARSET::script_has_upper_lower ( ) const
inline

Definition at line 763 of file unicharset.h.

{
  // Flag computed by post_load_setup() from the cased/uncased alpha balance.
  return this->script_has_upper_lower_;
}
bool UNICHARSET::script_has_xheight ( ) const
inline

Definition at line 770 of file unicharset.h.

{
  // Flag computed by post_load_setup().
  return this->script_has_xheight_;
}
void UNICHARSET::set_advance_range ( UNICHAR_ID  unichar_id,
int  min_advance,
int  max_advance 
)
inline

Definition at line 541 of file unicharset.h.

{
  // Both bounds are clipped to [0, MAX_INT16] before being stored as inT16.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.min_advance =
      static_cast<inT16>(ClipToRange(min_advance, 0, MAX_INT16));
  props.max_advance =
      static_cast<inT16>(ClipToRange(max_advance, 0, MAX_INT16));
}
void UNICHARSET::set_bearing_range ( UNICHAR_ID  unichar_id,
int  min_bearing,
int  max_bearing 
)
inline

Definition at line 520 of file unicharset.h.

{
  // Both bounds are clipped to [0, MAX_INT16] before being stored as inT16.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.min_bearing =
      static_cast<inT16>(ClipToRange(min_bearing, 0, MAX_INT16));
  props.max_bearing =
      static_cast<inT16>(ClipToRange(max_bearing, 0, MAX_INT16));
}
void UNICHARSET::set_black_and_whitelist ( const char *  blacklist,
const char *  whitelist 
)

Definition at line 829 of file unicharset.cpp.

{
  // With no whitelist every unichar starts enabled; otherwise every unichar
  // starts disabled and only the whitelisted ones are switched on.
  const bool enable_by_default = (whitelist == NULL || whitelist[0] == '\0');
  for (int id = 0; id < size_used; ++id)
    unichars[id].properties.enabled = enable_by_default;

  if (!enable_by_default) {
    // Enable the whitelist, one unichar at a time.
    int pos = 0;
    while (whitelist[pos] != '\0') {
      int len = step(whitelist + pos);
      if (len > 0) {
        const UNICHAR_ID u_id = unichar_to_id(whitelist + pos, len);
        if (u_id != INVALID_UNICHAR_ID)
          unichars[u_id].properties.enabled = true;
      } else {
        // step() found nothing at this position: advance by one byte.
        len = 1;
      }
      pos += len;
    }
  }

  if (blacklist != NULL && blacklist[0] != '\0') {
    // Disable the blacklist; it is applied last, so it overrides the
    // whitelist for unichars that appear in both.
    int pos = 0;
    while (blacklist[pos] != '\0') {
      int len = step(blacklist + pos);
      if (len > 0) {
        const UNICHAR_ID u_id = unichar_to_id(blacklist + pos, len);
        if (u_id != INVALID_UNICHAR_ID)
          unichars[u_id].properties.enabled = false;
      } else {
        // step() found nothing at this position: advance by one byte.
        len = 1;
      }
      pos += len;
    }
  }
}
void UNICHARSET::set_direction ( UNICHAR_ID  unichar_id,
UNICHARSET::Direction  value 
)
inline

Definition at line 377 of file unicharset.h.

{
  // Store the given direction on the unichar's properties.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.direction = value;
}
void UNICHARSET::set_isalpha ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 336 of file unicharset.h.

{
  // Store the isalpha flag on the unichar's properties.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.isalpha = value;
}
void UNICHARSET::set_isdigit ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 351 of file unicharset.h.

{
  // Store the isdigit flag on the unichar's properties.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.isdigit = value;
}
void UNICHARSET::set_islower ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 341 of file unicharset.h.

{
  // Store the islower flag on the unichar's properties.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.islower = value;
}
void UNICHARSET::set_isngram ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 361 of file unicharset.h.

{
  // Store the isngram flag on the unichar's properties.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.isngram = value;
}
void UNICHARSET::set_ispunctuation ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 356 of file unicharset.h.

{
  // Store the ispunctuation flag on the unichar's properties.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.ispunctuation = value;
}
void UNICHARSET::set_isupper ( UNICHAR_ID  unichar_id,
bool  value 
)
inline

Definition at line 346 of file unicharset.h.

{
  // Store the isupper flag on the unichar's properties.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.isupper = value;
}
void UNICHARSET::set_mirror ( UNICHAR_ID  unichar_id,
UNICHAR_ID  mirror 
)
inline

Definition at line 382 of file unicharset.h.

{
  // Store the id of the mirror unichar on this unichar's properties.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.mirror = mirror;
}
void UNICHARSET::set_normed ( UNICHAR_ID  unichar_id,
const char *  normed 
)
inline

Definition at line 387 of file unicharset.h.

{
  // Store the normed string on the unichar's properties.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.normed = normed;
}
void UNICHARSET::set_other_case ( UNICHAR_ID  unichar_id,
UNICHAR_ID  other_case 
)
inline

Definition at line 372 of file unicharset.h.

{
  // Store the id of the other-case unichar on this unichar's properties.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.other_case = other_case;
}
void UNICHARSET::set_ranges_empty ( )

Definition at line 324 of file unicharset.cpp.

{
  // Call SetRangesEmpty() on the properties of every unichar in use,
  // in ascending id order.
  int id = 0;
  while (id < size_used) {
    UNICHAR_PROPERTIES& props = unichars[id].properties;
    props.SetRangesEmpty();
    ++id;
  }
}
void UNICHARSET::set_script ( UNICHAR_ID  unichar_id,
const char *  value 
)
inline

Definition at line 367 of file unicharset.h.

{
  // Pass the script name through add_script() and keep the id it returns
  // as this unichar's script_id.
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.script_id = add_script(value);
}
void UNICHARSET::set_top_bottom ( UNICHAR_ID  unichar_id,
int  min_bottom,
int  max_bottom,
int  min_top,
int  max_top 
)
inline

Definition at line 473 of file unicharset.h.

{
  // Each limit goes through ClipToRange(..., 0, MAX_UINT8) and is narrowed
  // to uinT8 before being stored on the unichar's properties.
  const uinT8 stored_min_bottom =
      static_cast<uinT8>(ClipToRange(min_bottom, 0, MAX_UINT8));
  const uinT8 stored_max_bottom =
      static_cast<uinT8>(ClipToRange(max_bottom, 0, MAX_UINT8));
  const uinT8 stored_min_top =
      static_cast<uinT8>(ClipToRange(min_top, 0, MAX_UINT8));
  const uinT8 stored_max_top =
      static_cast<uinT8>(ClipToRange(max_top, 0, MAX_UINT8));

  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.min_bottom = stored_min_bottom;
  props.max_bottom = stored_max_bottom;
  props.min_top = stored_min_top;
  props.max_top = stored_max_top;
}
void UNICHARSET::set_width_range ( UNICHAR_ID  unichar_id,
int  min_width,
int  max_width 
)
inline

Definition at line 500 of file unicharset.h.

{
  // Both limits go through ClipToRange(..., 0, MAX_INT16) and are narrowed
  // to inT16 before being stored on the unichar's properties.
  const inT16 stored_min =
      static_cast<inT16>(ClipToRange(min_width, 0, MAX_INT16));
  const inT16 stored_max =
      static_cast<inT16>(ClipToRange(max_width, 0, MAX_INT16));
  UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  props.min_width = stored_min;
  props.max_width = stored_max;
}
void UNICHARSET::SetPropertiesFromOther ( const UNICHARSET & src)

Definition at line 333 of file unicharset.cpp.

{
  // For every unichar in this set, look its string up in src and, when src
  // supplies properties for it, copy them across.
  for (int id = 0; id < size_used; ++id) {
    const char* repr = id_to_unichar(id);
    UNICHAR_PROPERTIES imported;
    if (!src.GetStrProperties(repr, &imported))
      continue;  // src has no properties for this string.

    // script_id, other_case and mirror are ids relative to src, so each is
    // turned back into a string via src and re-resolved against this set.
    const char* script_name = src.get_script_from_script_id(imported.script_id);
    imported.script_id = add_script(script_name);

    // Fall back to the unichar itself when this set lacks the other-case form.
    const char* other_case_repr = src.id_to_unichar(imported.other_case);
    imported.other_case = contains_unichar(other_case_repr)
                              ? unichar_to_id(other_case_repr)
                              : id;

    // Likewise for the mirror form.
    const char* mirror_repr = src.id_to_unichar(imported.mirror);
    imported.mirror = contains_unichar(mirror_repr)
                          ? unichar_to_id(mirror_repr)
                          : id;

    unichars[id].properties.CopyFrom(imported);
  }
}
int UNICHARSET::size ( ) const
inline

Definition at line 264 of file unicharset.h.

{
  // The size reported is the size_used member.
  const int used = size_used;
  return used;
}
int UNICHARSET::step ( const char *  str) const

Definition at line 192 of file unicharset.cpp.

{
// Returns how many chars of str to take as the next unicharset member, or 0
// when nothing at the start of str matches. A length longer than the shortest
// match is only returned when it is itself a member and is followed by the end
// of the string or by another matching member; otherwise the shortest match
// length is returned.
// Find the length of the first matching unicharset member.
int minlength = ids.minmatch(str);
if (minlength == 0)
return 0; // Empty string or illegal char.
// Candidate length under test; starts at the shortest match and only grows.
int goodlength = minlength;
while (goodlength <= UNICHAR_LEN) {
// Accept the candidate if it reaches the end of the string or if whatever
// follows it also starts with a matching member.
if (str[goodlength] == '\0' || ids.minmatch(str + goodlength) > 0)
return goodlength; // This length works!
// The next char is illegal so find the next usable length.
// Grow the candidate until it is itself a member, the string ends, or the
// UNICHAR_LEN limit is passed.
do {
++goodlength;
} while (str[goodlength] != '\0' && goodlength <= UNICHAR_LEN &&
!ids.contains(str, goodlength));
// The loop above can also stop on end-of-string or on the length limit, so
// re-check that the candidate really is a member before looping again.
if (goodlength > UNICHAR_LEN || !ids.contains(str, goodlength)) {
// This does not constitute a good length!
return minlength;
}
}
// Search to find a subsequent legal char failed so return the minlength.
return minlength;
}
UNICHAR_ID UNICHARSET::to_lower ( UNICHAR_ID  unichar_id) const
inline

Definition at line 593 of file unicharset.h.

{
  // An invalid id maps to itself.
  if (unichar_id == INVALID_UNICHAR_ID) return INVALID_UNICHAR_ID;
  // An id already flagged islower is returned as is; anything else yields
  // its recorded other_case id.
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  return props.islower ? unichar_id : props.other_case;
}
UNICHAR_ID UNICHARSET::to_upper ( UNICHAR_ID  unichar_id) const
inline

Definition at line 601 of file unicharset.h.

{
  // An invalid id maps to itself.
  if (unichar_id == INVALID_UNICHAR_ID) return INVALID_UNICHAR_ID;
  // An id already flagged isupper is returned as is; anything else yields
  // its recorded other_case id.
  const UNICHAR_PROPERTIES& props = unichars[unichar_id].properties;
  return props.isupper ? unichar_id : props.other_case;
}
bool UNICHARSET::top_bottom_useful ( ) const
inline

Definition at line 438 of file unicharset.h.

{
  // Hand back the top_bottom_set_ member unchanged.
  const bool useful = top_bottom_set_;
  return useful;
}
void UNICHARSET::unichar_insert ( const char *const  unichar_repr)

Definition at line 511 of file unicharset.cpp.

{
  // Adds unichar_repr to the set under the next free id (size_used).
  // Nothing happens if the string is already present, and strings longer
  // than UNICHAR_LEN are rejected with a message on stderr.
  if (ids.contains(unichar_repr))
    return;

  if (strlen(unichar_repr) > UNICHAR_LEN) {
    fprintf(stderr, "Utf8 buffer too big, size=%d for %s\n",
            static_cast<int>(strlen(unichar_repr)), unichar_repr);
    return;
  }

  // Grow the storage when it is full: 8 slots initially, then doubling.
  if (size_used == size_reserved) {
    if (size_used == 0)
      reserve(8);
    else
      reserve(2 * size_used);
  }

  // The length check above bounds this copy to UNICHAR_LEN chars plus the
  // terminator (assumes representation holds UNICHAR_LEN + 1 chars - confirm).
  strcpy(unichars[size_used].representation, unichar_repr);
  this->set_script(size_used, null_script);

  // If the given unichar_repr represents a fragmented character, set
  // fragment property to a pointer to CHAR_FRAGMENT class instance with
  // information parsed from the unichar representation. Use the script
  // of the base unichar for the fragmented character if possible.
  // The declaration of frag was missing, leaving the uses below referring to
  // an undeclared name.
  CHAR_FRAGMENT* frag = CHAR_FRAGMENT::parse_from_string(unichar_repr);
  this->unichars[size_used].properties.fragment = frag;
  if (frag != NULL && this->contains_unichar(frag->get_unichar())) {
    this->unichars[size_used].properties.script_id =
        this->get_script(frag->get_unichar());
  }

  this->unichars[size_used].properties.enabled = true;
  ids.insert(unichar_repr, size_used);
  ++size_used;
}
const UNICHAR_ID UNICHARSET::unichar_to_id ( const char *const  unichar_repr) const

Definition at line 176 of file unicharset.cpp.

{
  // Strings that ids does not contain map to INVALID_UNICHAR_ID.
  if (!ids.contains(unichar_repr))
    return INVALID_UNICHAR_ID;
  return ids.unichar_to_id(unichar_repr);
}
const UNICHAR_ID UNICHARSET::unichar_to_id ( const char *const  unichar_repr,
int  length 
) const

Definition at line 181 of file unicharset.cpp.

{
  // Only lengths in 1..UNICHAR_LEN are accepted.
  assert(length > 0 && length <= UNICHAR_LEN);
  // Prefixes that ids does not contain map to INVALID_UNICHAR_ID.
  if (!ids.contains(unichar_repr, length))
    return INVALID_UNICHAR_ID;
  return ids.unichar_to_id(unichar_repr, length);
}

Member Data Documentation

const char * UNICHARSET::kCustomLigatures
static
Initial value:
{
  // Each row pairs a plain two-letter sequence with a single code point in
  // the Unicode Private Use Area (U+E000..U+F8FF).
  {"ct", "\uE003"},  // c + t -> U+E003
  {"ſh", "\uE006"},  // long-s + h -> U+E006
  {"ſi", "\uE007"},  // long-s + i -> U+E007
  {"ſl", "\uE008"},  // long-s + l -> U+E008
  {"ſſ", "\uE009"},  // long-s + long-s -> U+E009
  // Terminating row: the table had no end marker after the trailing comma.
  // NOTE(review): presumably the code that walks this table stops at the
  // first NULL entry - confirm against those loops.
  {NULL, NULL}
}

Definition at line 132 of file unicharset.h.


The documentation for this class was generated from the following files: