Tesseract
3.02
Main Page
Related Pages
Modules
Namespaces
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
word_list_lang_model.h
Go to the documentation of this file.
1
/**********************************************************************
2
* File: word_list_lang_model.h
3
* Description: Declaration of the Word List Language Model Class
4
* Author: Ahmad Abdulkader
5
* Created: 2008
6
*
7
* (C) Copyright 2008, Google Inc.
8
** Licensed under the Apache License, Version 2.0 (the "License");
9
** you may not use this file except in compliance with the License.
10
** You may obtain a copy of the License at
11
** http://www.apache.org/licenses/LICENSE-2.0
12
** Unless required by applicable law or agreed to in writing, software
13
** distributed under the License is distributed on an "AS IS" BASIS,
14
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
** See the License for the specific language governing permissions and
16
** limitations under the License.
17
*
18
**********************************************************************/
19
20
// The WordListLangModel class abstracts a language model that is based on
21
// a list of words. It inherits from the LangModel abstract class.
22
// Besides providing the methods inherited from the LangModel abstract class,
23
// the class provides methods to add new strings to the Language Model:
24
// AddString & AddString32
25
26
#ifndef WORD_LIST_LANG_MODEL_H
27
#define WORD_LIST_LANG_MODEL_H
28
29
#include <vector>
30
31
#include "
cube_reco_context.h
"
32
#include "
lang_model.h
"
33
#include "
tess_lang_mod_edge.h
"
34
35
namespace
tesseract
{
36
37
class
Trie;
38
39
class
WordListLangModel
:
public
LangModel
{
40
public
:
41
explicit
WordListLangModel
(
CubeRecoContext
*cntxt);
42
~WordListLangModel
();
43
// Returns an edge pointer to the Root
44
LangModEdge
*
Root
();
45
// Returns the edges that fan-out of the specified edge and their count
46
LangModEdge
**
GetEdges
(
CharAltList
*alt_list,
47
LangModEdge
*edge,
48
int
*edge_cnt);
49
// Returns is a sequence of 32-bit characters are valid within this language
50
// model or net. And EndOfWord flag is specified. If true, the sequence has
51
// to end on a valid word. The function also optionally returns the list
52
// of language model edges traversed to parse the string
53
bool
IsValidSequence
(
const
char_32
*sequence,
54
bool
eow_flag,
55
LangModEdge
**edges);
56
bool
IsLeadingPunc
(
char_32
ch) {
return
false
; }
// not yet implemented
57
bool
IsTrailingPunc
(
char_32
ch) {
return
false
; }
// not yet implemented
58
bool
IsDigit
(
char_32
ch) {
return
false
; }
// not yet implemented
59
// Adds a new UTF-8 string to the language model
60
bool
AddString
(
const
char
*char_ptr);
61
// Adds a new UTF-32 string to the language model
62
bool
AddString32
(
const
char_32
*char_32_ptr);
63
// Compute all the variants of a 32-bit string in terms of the class-ids.
64
// This is needed for languages that have ligatures. A word can then have
65
// more than one spelling in terms of the class-ids.
66
static
void
WordVariants
(
const
CharSet
&char_set,
const
UNICHARSET
*uchset,
67
string_32
str32,
68
vector<WERD_CHOICE *> *word_variants);
69
private
:
70
// constants needed to configure the language model
71
static
const
int
kMaxEdge = 512;
72
static
const
int
kMaxDawgEdges = 20000;
73
74
CubeRecoContext
*cntxt_;
75
Trie
*dawg_;
76
bool
init_;
77
// Initialize the language model
78
bool
Init();
79
// Cleanup
80
void
Cleanup();
81
// Recursive helper function for WordVariants().
82
static
void
WordVariants
(
83
const
CharSet
&char_set,
84
string_32
prefix_str32,
WERD_CHOICE
*word_so_far,
85
string_32
str32,
86
vector<WERD_CHOICE *> *word_variants);
87
};
88
}
// tesseract
89
90
#endif // WORD_LIST_LANG_MODEL_H
mnt
data
src
tesseract-ocr
cube
word_list_lang_model.h
Generated on Thu Nov 1 2012 20:19:48 for Tesseract by
1.8.1