Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
commontraining.h
Go to the documentation of this file.
1 // Copyright 2008 Google Inc. All Rights Reserved.
2 // Author: scharron@google.com (Samuel Charron)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 #ifndef TESSERACT_TRAINING_COMMONTRAINING_H__
15 #define TESSERACT_TRAINING_COMMONTRAINING_H__
16 
17 #include "oldlist.h"
18 #include "cluster.h"
19 #include "intproto.h"
20 #include "featdefs.h"
21 
22 // Macros to merge tesseract params with command-line flags.
// Both preprocessor branches below define the same four macros
// (INT_PARAM_FLAG, DECLARE_INT_PARAM_FLAG, STRING_PARAM_FLAG,
// DECLARE_STRING_PARAM_FLAG), so code written against them does not need to
// know which flag backend was selected at build time.
23 #ifdef USE_STD_NAMESPACE
// USE_STD_NAMESPACE branch: each flag expands to an INT_VAR / STRING_VAR
// style variable whose identifier is the flag name prefixed with FLAGS_.
24 #include "params.h"
25 # define INT_PARAM_FLAG(name, val, comment) \
26  INT_VAR(FLAGS_##name, val, comment)
// The DECLARE_* forms only produce an extern declaration; the 0 / "" value and
// empty comment passed to the *_VAR_H macros are placeholders.
27 # define DECLARE_INT_PARAM_FLAG(name) extern INT_VAR_H(FLAGS_##name, 0, "")
28 # define STRING_PARAM_FLAG(name, val, comment) \
29  STRING_VAR(FLAGS_##name, val, comment)
30 # define DECLARE_STRING_PARAM_FLAG(name) \
31  extern STRING_VAR_H(FLAGS_##name, "", "")
// NOTE(review): object-like macro -- from this point on, every `c_str` token
// in any translation unit that includes this header is rewritten to `string`.
// Presumably this lets call sites written as FLAGS_x.c_str() compile against
// the params.h string type; confirm before changing or removing it.
32 # define c_str string
33 #else
// Fallback branch: the macros forward the bare flag name to the
// DEFINE_* / DECLARE_* macros expected from base/commandlineflags.h.
34 #include "base/commandlineflags.h"
35 # define INT_PARAM_FLAG(name, val, comment) \
36  DEFINE_int32(name, val, comment)
37 # define DECLARE_INT_PARAM_FLAG(name) DECLARE_int32(name)
38 # define STRING_PARAM_FLAG(name, val, comment) \
39  DEFINE_string(name, val, comment)
40 # define DECLARE_STRING_PARAM_FLAG(name) DECLARE_string(name)
41 #endif
42 
// Forward declarations only: in the prototypes visible in this header,
// MasterTrainer and ShapeTable are used solely as pointer or reference types,
// so their full class definitions are not required here.
43 namespace tesseract {
44 class Classify;
45 class MasterTrainer;
46 class ShapeTable;
47 }
48 
50 // Globals ///////////////////////////////////////////////////////////////////
52 
54 
55 // Must be defined in the file that "implements" commonTraining facilities.
56 extern CLUSTERCONFIG Config;
57 
59 // Structs ///////////////////////////////////////////////////////////////////
61 typedef struct
62 {
63  char *Label;
67 }
69 
70 typedef struct
71 {
72  char* Label;
73  int NumMerged[MAX_NUM_PROTOS];
77 
78 
80 // Functions /////////////////////////////////////////////////////////////////
82 void ParseArguments(int* argc, char*** argv);
83 
84 namespace tesseract {
85 // Helper loads shape table from the given file.
86 ShapeTable* LoadShapeTable(const STRING& file_prefix);
87 // Helper to write the shape_table.
88 void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table);
89 
90 // Creates a MasterTrainer and loads the training data into it:
91 // Initializes feature_defs and IntegerFX.
92 // Loads the shape_table if shape_table != NULL.
93 // Loads initial unicharset from -U command-line option.
94 // If FLAGS_input_trainer is set, loads the majority of data from there, else:
95 // Loads font info from -F option.
96 // Loads xheights from -X option.
97 // Loads samples from .tr files in remaining command-line args.
98 // Deletes outliers and computes canonical samples.
99 // If FLAGS_output_trainer is set, saves the trainer for future use.
100 // Computes canonical and cloud features.
101 // If shape_table is not NULL, but failed to load, make a fake flat one,
102 // as shape clustering was not run.
103 MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
104  bool replication,
105  ShapeTable** shape_table,
106  STRING* file_prefix);
107 } // namespace tesseract.
108 
109 const char *GetNextFilename(int argc, const char* const * argv);
110 
111 LABELEDLIST FindList(
112  LIST List,
113  char *Label);
114 
115 LABELEDLIST NewLabeledList(
116  const char *Label);
117 
118 void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_defs,
119  const char *feature_name, int max_samples,
120  UNICHARSET* unicharset,
121  FILE* file, LIST* training_samples);
122 
124  const FEATURE_DEFS_STRUCT &FeatureDefs,
125  char *Directory,
126  LIST CharList,
127  const char *program_feature_type);
128 
130  LIST CharList);
131 
132 void FreeLabeledList(
133  LABELEDLIST LabeledList);
134 
136  LIST ClassListList);
137 
139  const FEATURE_DEFS_STRUCT &FeatureDefs,
140  LABELEDLIST CharSample,
141  const char *program_feature_type);
142 
144  LIST ProtoList,
145  BOOL8 KeepSigProtos,
146  BOOL8 KeepInsigProtos,
147  int N);
148 
149 void CleanUpUnusedData(
150  LIST ProtoList);
151 
153  LIST ProtoList,
154  const char *label,
155  CLUSTERER *Clusterer,
156  CLUSTERCONFIG *Config);
157 
159  LIST List,
160  const char *Label);
161 
163  const char *Label);
164 
166  LIST CharList);
167 
168 CLASS_STRUCT* SetUpForFloat2Int(const UNICHARSET& unicharset,
169  LIST LabeledClassList);
170 
171 void Normalize(
172  float *Values);
173 
174 void FreeNormProtoList(
175  LIST CharList);
176 
178  LIST* NormProtoList,
179  LIST ProtoList,
180  char *CharName);
181 
182 int NumberOfProtos(
183  LIST ProtoList,
184  BOOL8 CountSigProtos,
185  BOOL8 CountInsigProtos);
186 
187 
188 void allocNormProtos();
189 #endif // TESSERACT_TRAINING_COMMONTRAINING_H__