Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
mastertrainer.cpp
Go to the documentation of this file.
1 // Copyright 2010 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
4 // File: mastertrainer.cpp
5 // Description: Trainer to build the MasterClassifier.
6 // Author: Ray Smith
7 // Created: Wed Nov 03 18:10:01 PDT 2010
8 //
9 // (C) Copyright 2010, Google Inc.
10 // Licensed under the Apache License, Version 2.0 (the "License");
11 // you may not use this file except in compliance with the License.
12 // You may obtain a copy of the License at
13 // http://www.apache.org/licenses/LICENSE-2.0
14 // Unless required by applicable law or agreed to in writing, software
15 // distributed under the License is distributed on an "AS IS" BASIS,
16 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 // See the License for the specific language governing permissions and
18 // limitations under the License.
19 //
21 
22 // Include automatically generated configuration file if running autoconf.
23 #ifdef HAVE_CONFIG_H
24 #include "config_auto.h"
25 #endif
26 
27 #include "mastertrainer.h"
28 #include <math.h>
29 #include <time.h>
30 #include "allheaders.h"
31 #include "boxread.h"
32 #include "classify.h"
33 #include "errorcounter.h"
34 #include "featdefs.h"
35 #include "sampleiterator.h"
36 #include "shapeclassifier.h"
37 #include "shapetable.h"
38 #include "svmnode.h"
39 
40 namespace tesseract {
41 
// Tuning constants for shape clustering. When kMinClusteredShapes is low and
// kMaxUnicharsPerCluster is high, kFontMergeDistance is effectively the only
// limit on how far clustering proceeds.

// Lower bound on the number of shapes left in the clustered output.
const int kMinClusteredShapes = 1;
// Upper bound on the number of unichars allowed in a single cluster.
const int kMaxUnicharsPerCluster = 2000;
// Fonts and unichars are merged while their mean font distance is below this.
const float kFontMergeDistance = 0.025f;
50 
52  bool shape_analysis,
53  bool replicate_samples,
54  int debug_level)
55  : norm_mode_(norm_mode), samples_(fontinfo_table_),
56  junk_samples_(fontinfo_table_), verify_samples_(fontinfo_table_),
57  charsetsize_(0),
58  enable_shape_anaylsis_(shape_analysis),
59  enable_replication_(replicate_samples),
60  fragments_(NULL), prev_unichar_id_(-1), debug_level_(debug_level) {
61  fontinfo_table_.set_compare_callback(
63  fontinfo_table_.set_clear_callback(
65 }
66 
68  delete [] fragments_;
69  for (int p = 0; p < page_images_.size(); ++p)
70  pixDestroy(&page_images_[p]);
71 }
72 
73 // WARNING! Serialize/DeSerialize are only partial, providing
74 // enough data to get the samples back and display them.
75 // Writes to the given file. Returns false in case of error.
76 bool MasterTrainer::Serialize(FILE* fp) const {
77  if (fwrite(&norm_mode_, sizeof(norm_mode_), 1, fp) != 1) return false;
78  if (!unicharset_.save_to_file(fp)) return false;
79  if (!feature_space_.Serialize(fp)) return false;
80  if (!samples_.Serialize(fp)) return false;
81  if (!junk_samples_.Serialize(fp)) return false;
82  if (!verify_samples_.Serialize(fp)) return false;
83  if (!master_shapes_.Serialize(fp)) return false;
84  if (!flat_shapes_.Serialize(fp)) return false;
85  if (!fontinfo_table_.write(fp, NewPermanentTessCallback(write_info)))
86  return false;
87  if (!fontinfo_table_.write(fp, NewPermanentTessCallback(write_spacing_info)))
88  return false;
89  if (!xheights_.Serialize(fp)) return false;
90  return true;
91 }
92 
93 // Reads from the given file. Returns false in case of error.
94 // If swap is true, assumes a big/little-endian swap is needed.
95 bool MasterTrainer::DeSerialize(bool swap, FILE* fp) {
96  if (fread(&norm_mode_, sizeof(norm_mode_), 1, fp) != 1) return false;
97  if (swap) {
98  ReverseN(&norm_mode_, sizeof(norm_mode_));
99  }
100  if (!unicharset_.load_from_file(fp)) return false;
101  charsetsize_ = unicharset_.size();
102  if (!feature_space_.DeSerialize(swap, fp)) return false;
103  feature_map_.Init(feature_space_);
104  if (!samples_.DeSerialize(swap, fp)) return false;
105  if (!junk_samples_.DeSerialize(swap, fp)) return false;
106  if (!verify_samples_.DeSerialize(swap, fp)) return false;
107  if (!master_shapes_.DeSerialize(swap, fp)) return false;
108  if (!flat_shapes_.DeSerialize(swap, fp)) return false;
109  if (!fontinfo_table_.read(fp, NewPermanentTessCallback(read_info), swap))
110  return false;
111  if (!fontinfo_table_.read(fp, NewPermanentTessCallback(read_spacing_info),
112  swap))
113  return false;
114  if (!xheights_.DeSerialize(swap, fp)) return false;
115  return true;
116 }
117 
118 // Load an initial unicharset, or set one up if the file cannot be read.
120  if (!unicharset_.load_from_file(filename)) {
121  tprintf("Failed to load unicharset from file %s\n"
122  "Building unicharset for training from scratch...\n",
123  filename);
124  unicharset_.clear();
125  // Space character needed to represent NIL_LIST classification.
126  unicharset_.unichar_insert(" ");
127  }
128  charsetsize_ = unicharset_.size();
129  delete [] fragments_;
130  fragments_ = new int[charsetsize_];
131  memset(fragments_, 0, sizeof(*fragments_) * charsetsize_);
132  samples_.LoadUnicharset(filename);
133  junk_samples_.LoadUnicharset(filename);
134  verify_samples_.LoadUnicharset(filename);
135 }
136 
137 // Reads the samples and their features from the given .tr format file,
138 // adding them to the trainer with the font_id from the content of the file.
139 // See mftraining.cpp for a description of the file format.
140 // If verification, then these are verification samples, not training.
143  bool verification) {
144  char buffer[2048];
145  int int_feature_type = ShortNameToFeatureType(feature_defs, kIntFeatureType);
146  int micro_feature_type = ShortNameToFeatureType(feature_defs,
148  int cn_feature_type = ShortNameToFeatureType(feature_defs, kCNFeatureType);
149  int geo_feature_type = ShortNameToFeatureType(feature_defs, kGeoFeatureType);
150 
151  while (fgets(buffer, sizeof(buffer), fp) != NULL) {
152  if (buffer[0] == '\n')
153  continue;
154 
155  char* space = strchr(buffer, ' ');
156  if (space == NULL) {
157  tprintf("Bad format in tr file, reading fontname, unichar\n");
158  continue;
159  }
160  *space++ = '\0';
161  int font_id = GetFontInfoId(buffer);
162  int page_number;
163  STRING unichar;
164  TBOX bounding_box;
165  if (!ParseBoxFileStr(space, &page_number, &unichar, &bounding_box)) {
166  tprintf("Bad format in tr file, reading box coords\n");
167  continue;
168  }
169  CHAR_DESC char_desc = ReadCharDescription(feature_defs, fp);
171  sample->set_font_id(font_id);
172  sample->set_page_num(page_number + page_images_.size());
173  sample->set_bounding_box(bounding_box);
174  sample->ExtractCharDesc(int_feature_type, micro_feature_type,
175  cn_feature_type, geo_feature_type, char_desc);
176  AddSample(verification, unichar.string(), sample);
177  FreeCharDescription(char_desc);
178  }
179  charsetsize_ = unicharset_.size();
180 }
181 
182 // Adds the given single sample to the trainer, setting the classid
183 // appropriately from the given unichar_str.
184 void MasterTrainer::AddSample(bool verification, const char* unichar,
186  if (verification) {
187  verify_samples_.AddSample(unichar, sample);
188  prev_unichar_id_ = -1;
189  } else if (unicharset_.contains_unichar(unichar)) {
190  if (prev_unichar_id_ >= 0)
191  fragments_[prev_unichar_id_] = -1;
192  prev_unichar_id_ = samples_.AddSample(unichar, sample);
193  if (flat_shapes_.FindShape(prev_unichar_id_, sample->font_id()) < 0)
194  flat_shapes_.AddShape(prev_unichar_id_, sample->font_id());
195  } else {
196  int junk_id = junk_samples_.AddSample(unichar, sample);
197  if (prev_unichar_id_ >= 0) {
199  if (frag != NULL && frag->is_natural()) {
200  if (fragments_[prev_unichar_id_] == 0)
201  fragments_[prev_unichar_id_] = junk_id;
202  else if (fragments_[prev_unichar_id_] != junk_id)
203  fragments_[prev_unichar_id_] = -1;
204  }
205  delete frag;
206  }
207  prev_unichar_id_ = -1;
208  }
209 }
210 
211 // Loads all pages from the given tif filename and append to page_images_.
212 // Must be called after ReadTrainingSamples, as the current number of images
213 // is used as an offset for page numbers in the samples.
215  int page;
216  Pix* pix;
217  for (page = 0; (pix = pixReadTiff(filename, page)) != NULL; ++page) {
218  page_images_.push_back(pix);
219  }
220  tprintf("Loaded %d page images from %s\n", page, filename);
221 }
222 
223 // Cleans up the samples after initial load from the tr files, and prior to
224 // saving the MasterTrainer:
225 // Remaps fragmented chars if running shape analysis.
226 // Sets up the samples appropriately for class/fontwise access.
227 // Deletes outlier samples.
229  if (debug_level_ > 0)
230  tprintf("PostLoadCleanup...\n");
231  if (enable_shape_anaylsis_)
232  ReplaceFragmentedSamples();
233  SampleIterator sample_it;
234  sample_it.Init(NULL, NULL, true, &verify_samples_);
235  sample_it.NormalizeSamples();
236  verify_samples_.OrganizeByFontAndClass();
237 
238  samples_.IndexFeatures(feature_space_);
239  // TODO(rays) DeleteOutliers is currently turned off to prove NOP-ness
240  // against current training.
241  // samples_.DeleteOutliers(feature_space_, debug_level_ > 0);
242  samples_.OrganizeByFontAndClass();
243  if (debug_level_ > 0)
244  tprintf("ComputeCanonicalSamples...\n");
245  samples_.ComputeCanonicalSamples(feature_map_, debug_level_ > 0);
246 }
247 
248 // Gets the samples ready for training. Use after both
249 // ReadTrainingSamples+PostLoadCleanup or DeSerialize.
250 // Re-indexes the features and computes canonical and cloud features.
252  if (debug_level_ > 0)
253  tprintf("PreTrainingSetup...\n");
254  samples_.IndexFeatures(feature_space_);
255  samples_.ComputeCanonicalFeatures();
256  if (debug_level_ > 0)
257  tprintf("ComputeCloudFeatures...\n");
258  samples_.ComputeCloudFeatures(feature_space_.Size());
259 }
260 
261 // Sets up the master_shapes_ table, which tells which fonts should stay
262 // together until they get to a leaf node classifier.
264  tprintf("Building master shape table\n");
265  int num_fonts = samples_.NumFonts();
266 
267  ShapeTable char_shapes_begin_fragment(samples_.unicharset());
268  ShapeTable char_shapes_end_fragment(samples_.unicharset());
269  ShapeTable char_shapes(samples_.unicharset());
270  for (int c = 0; c < samples_.charsetsize(); ++c) {
271  ShapeTable shapes(samples_.unicharset());
272  for (int f = 0; f < num_fonts; ++f) {
273  if (samples_.NumClassSamples(f, c, true) > 0)
274  shapes.AddShape(c, f);
275  }
276  ClusterShapes(kMinClusteredShapes, 1, kFontMergeDistance, &shapes);
277 
278  const CHAR_FRAGMENT *fragment = samples_.unicharset().get_fragment(c);
279 
280  if (fragment == NULL)
281  char_shapes.AppendMasterShapes(shapes);
282  else if (fragment->is_beginning())
283  char_shapes_begin_fragment.AppendMasterShapes(shapes);
284  else if (fragment->is_ending())
285  char_shapes_end_fragment.AppendMasterShapes(shapes);
286  else
287  char_shapes.AppendMasterShapes(shapes);
288  }
290  kFontMergeDistance, &char_shapes_begin_fragment);
291  char_shapes.AppendMasterShapes(char_shapes_begin_fragment);
293  kFontMergeDistance, &char_shapes_end_fragment);
294  char_shapes.AppendMasterShapes(char_shapes_end_fragment);
296  kFontMergeDistance, &char_shapes);
297  master_shapes_.AppendMasterShapes(char_shapes);
298  tprintf("Master shape_table:%s\n", master_shapes_.SummaryStr().string());
299 }
300 
301 // Adds the junk_samples_ to the main samples_ set. Junk samples are initially
302 // fragments and n-grams (all incorrectly segmented characters).
303 // Various training functions may result in incorrectly segmented characters
304 // being added to the unicharset of the main samples, perhaps because they
305 // form a "radical" decomposition of some (Indic) grapheme, or because they
306 // just look the same as a real character (like rn/m)
307 // This function moves all the junk samples, to the main samples_ set, but
308 // desirable junk, being any sample for which the unichar already exists in
309 // the samples_ unicharset gets the unichar-ids re-indexed to match, but
310 // anything else gets re-marked as unichar_id 0 (space character) to identify
311 // it as junk to the error counter.
313  // Get ids of fragments in junk_samples_ that replace the dead chars.
314  const UNICHARSET& junk_set = junk_samples_.unicharset();
315  const UNICHARSET& sample_set = samples_.unicharset();
316  int num_junks = junk_samples_.num_samples();
317  tprintf("Moving %d junk samples to master sample set.\n", num_junks);
318  for (int s = 0; s < num_junks; ++s) {
319  TrainingSample* sample = junk_samples_.mutable_sample(s);
320  int junk_id = sample->class_id();
321  const char* junk_utf8 = junk_set.id_to_unichar(junk_id);
322  int sample_id = sample_set.unichar_to_id(junk_utf8);
323  if (sample_id == INVALID_UNICHAR_ID)
324  sample_id = 0;
325  sample->set_class_id(sample_id);
326  junk_samples_.extract_sample(s);
327  samples_.AddSample(sample_id, sample);
328  }
329  junk_samples_.DeleteDeadSamples();
330  samples_.OrganizeByFontAndClass();
331 }
332 
333 // Replicates the samples and perturbs them if the enable_replication_ flag
334 // is set. MUST be used after the last call to OrganizeByFontAndClass on
335 // the training samples, ie after IncludeJunk if it is going to be used, as
336 // OrganizeByFontAndClass will eat the replicated samples into the regular
337 // samples.
339  if (enable_replication_) {
340  if (debug_level_ > 0)
341  tprintf("ReplicateAndRandomize...\n");
342  verify_samples_.ReplicateAndRandomizeSamples();
343  samples_.ReplicateAndRandomizeSamples();
344  samples_.IndexFeatures(feature_space_);
345  }
346 }
347 
348 // Loads the basic font properties file into fontinfo_table_.
349 // Returns false on failure.
351  FILE* fp = fopen(filename, "rb");
352  if (fp == NULL) {
353  fprintf(stderr, "Failed to load font_properties from %s\n", filename);
354  return false;
355  }
356  int italic, bold, fixed, serif, fraktur;
357  while (!feof(fp)) {
358  FontInfo fontinfo;
359  char* font_name = new char[1024];
360  fontinfo.name = font_name;
361  fontinfo.properties = 0;
362  fontinfo.universal_id = 0;
363  if (fscanf(fp, "%1024s %i %i %i %i %i\n", font_name,
364  &italic, &bold, &fixed, &serif, &fraktur) != 6)
365  continue;
366  fontinfo.properties =
367  (italic << 0) +
368  (bold << 1) +
369  (fixed << 2) +
370  (serif << 3) +
371  (fraktur << 4);
372  if (!fontinfo_table_.contains(fontinfo)) {
373  fontinfo_table_.push_back(fontinfo);
374  }
375  }
376  fclose(fp);
377  return true;
378 }
379 
380 // Loads the xheight font properties file into xheights_.
381 // Returns false on failure.
383  tprintf("fontinfo table is of size %d\n", fontinfo_table_.size());
384  xheights_.init_to_size(fontinfo_table_.size(), -1);
385  if (filename == NULL) return true;
386  FILE *f = fopen(filename, "rb");
387  if (f == NULL) {
388  fprintf(stderr, "Failed to load font xheights from %s\n", filename);
389  return false;
390  }
391  tprintf("Reading x-heights from %s ...\n", filename);
392  FontInfo fontinfo;
393  fontinfo.properties = 0; // Not used to lookup in the table.
394  fontinfo.universal_id = 0;
395  char buffer[1024];
396  int xht;
397  int total_xheight = 0;
398  int xheight_count = 0;
399  while (!feof(f)) {
400  if (fscanf(f, "%1024s %d\n", buffer, &xht) != 2)
401  continue;
402  fontinfo.name = buffer;
403  if (!fontinfo_table_.contains(fontinfo)) continue;
404  int fontinfo_id = fontinfo_table_.get_id(fontinfo);
405  xheights_[fontinfo_id] = xht;
406  total_xheight += xht;
407  ++xheight_count;
408  }
409  if (xheight_count == 0) {
410  fprintf(stderr, "No valid xheights in %s!\n", filename);
411  return false;
412  }
413  int mean_xheight = DivRounded(total_xheight, xheight_count);
414  for (int i = 0; i < fontinfo_table_.size(); ++i) {
415  if (xheights_[i] < 0)
416  xheights_[i] = mean_xheight;
417  }
418  return true;
419 } // LoadXHeights
420 
421 // Reads spacing stats from filename and adds them to fontinfo_table.
423  FILE* fontinfo_file = fopen(filename, "rb");
424  if (fontinfo_file == NULL)
425  return true; // We silently ignore missing files!
426  // Find the fontinfo_id.
427  int fontinfo_id = GetBestMatchingFontInfoId(filename);
428  if (fontinfo_id < 0) {
429  tprintf("No font found matching fontinfo filename %s\n", filename);
430  fclose(fontinfo_file);
431  return false;
432  }
433  tprintf("Reading spacing from %s for font %d...\n", filename, fontinfo_id);
434  // TODO(rays) scale should probably be a double, but keep as an int for now
435  // to duplicate current behavior.
436  int scale = kBlnXHeight / xheights_[fontinfo_id];
437  int num_unichars;
438  char uch[UNICHAR_LEN];
439  char kerned_uch[UNICHAR_LEN];
440  int x_gap, x_gap_before, x_gap_after, num_kerned;
441  ASSERT_HOST(fscanf(fontinfo_file, "%d\n", &num_unichars) == 1);
442  FontInfo *fi = fontinfo_table_.get_mutable(fontinfo_id);
443  fi->init_spacing(unicharset_.size());
444  FontSpacingInfo *spacing = NULL;
445  for (int l = 0; l < num_unichars; ++l) {
446  if (fscanf(fontinfo_file, "%s %d %d %d",
447  uch, &x_gap_before, &x_gap_after, &num_kerned) != 4) {
448  tprintf("Bad format of font spacing file %s\n", filename);
449  fclose(fontinfo_file);
450  return false;
451  }
452  bool valid = unicharset_.contains_unichar(uch);
453  if (valid) {
454  spacing = new FontSpacingInfo();
455  spacing->x_gap_before = static_cast<inT16>(x_gap_before * scale);
456  spacing->x_gap_after = static_cast<inT16>(x_gap_after * scale);
457  }
458  for (int k = 0; k < num_kerned; ++k) {
459  if (fscanf(fontinfo_file, "%s %d", kerned_uch, &x_gap) != 2) {
460  tprintf("Bad format of font spacing file %s\n", filename);
461  fclose(fontinfo_file);
462  return false;
463  }
464  if (!valid || !unicharset_.contains_unichar(kerned_uch)) continue;
465  spacing->kerned_unichar_ids.push_back(
466  unicharset_.unichar_to_id(kerned_uch));
467  spacing->kerned_x_gaps.push_back(static_cast<inT16>(x_gap * scale));
468  }
469  if (valid) fi->add_spacing(unicharset_.unichar_to_id(uch), spacing);
470  }
471  fclose(fontinfo_file);
472  return true;
473 }
474 
475 // Returns the font id corresponding to the given font name.
476 // Returns -1 if the font cannot be found.
477 int MasterTrainer::GetFontInfoId(const char* font_name) {
478  FontInfo fontinfo;
479  // We are only borrowing the string, so it is OK to const cast it.
480  fontinfo.name = const_cast<char*>(font_name);
481  fontinfo.properties = 0; // Not used to lookup in the table
482  fontinfo.universal_id = 0;
483  if (!fontinfo_table_.contains(fontinfo)) {
484  return -1;
485  } else {
486  return fontinfo_table_.get_id(fontinfo);
487  }
488 }
489 // Returns the font_id of the closest matching font name to the given
490 // filename. It is assumed that a substring of the filename will match
491 // one of the fonts. If more than one is matched, the longest is returned.
493  int fontinfo_id = -1;
494  int best_len = 0;
495  for (int f = 0; f < fontinfo_table_.size(); ++f) {
496  if (strstr(filename, fontinfo_table_.get(f).name) != NULL) {
497  int len = strlen(fontinfo_table_.get(f).name);
498  // Use the longest matching length in case a substring of a font matched.
499  if (len > best_len) {
500  best_len = len;
501  fontinfo_id = f;
502  }
503  }
504  }
505  return fontinfo_id;
506 }
507 
508 // Sets up a flat shapetable with one shape per class/font combination.
510  // To exactly mimic the results of the previous implementation, the shapes
511  // must be clustered in order the fonts arrived, and reverse order of the
512  // characters within each font.
513  // Get a list of the fonts in the order they appeared.
514  GenericVector<int> active_fonts;
515  int num_shapes = flat_shapes_.NumShapes();
516  for (int s = 0; s < num_shapes; ++s) {
517  int font = flat_shapes_.GetShape(s)[0].font_ids[0];
518  int f = 0;
519  for (f = 0; f < active_fonts.size(); ++f) {
520  if (active_fonts[f] == font)
521  break;
522  }
523  if (f == active_fonts.size())
524  active_fonts.push_back(font);
525  }
526  // For each font in order, add all the shapes with that font in reverse order.
527  int num_fonts = active_fonts.size();
528  for (int f = 0; f < num_fonts; ++f) {
529  for (int s = num_shapes - 1; s >= 0; --s) {
530  int font = flat_shapes_.GetShape(s)[0].font_ids[0];
531  if (font == active_fonts[f]) {
532  shape_table->AddShape(flat_shapes_.GetShape(s));
533  }
534  }
535  }
536 }
537 
538 // Sets up a Clusterer for mftraining on a single shape_id.
539 // Call FreeClusterer on the return value after use.
541  const ShapeTable& shape_table,
543  int shape_id,
544  int* num_samples) {
545 
546  int desc_index = ShortNameToFeatureType(feature_defs, kMicroFeatureType);
547  int num_params = feature_defs.FeatureDesc[desc_index]->NumParams;
548  ASSERT_HOST(num_params == MFCount);
549  CLUSTERER* clusterer = MakeClusterer(
550  num_params, feature_defs.FeatureDesc[desc_index]->ParamDesc);
551 
552  // We want to iterate over the samples of just the one shape.
553  IndexMapBiDi shape_map;
554  shape_map.Init(shape_table.NumShapes(), false);
555  shape_map.SetMap(shape_id, true);
556  shape_map.Setup();
557  // Reverse the order of the samples to match the previous behavior.
559  SampleIterator it;
560  it.Init(&shape_map, &shape_table, false, &samples_);
561  for (it.Begin(); !it.AtEnd(); it.Next()) {
562  sample_ptrs.push_back(&it.GetSample());
563  }
564  int sample_id = 0;
565  for (int i = sample_ptrs.size() - 1; i >= 0; --i) {
566  const TrainingSample* sample = sample_ptrs[i];
567  int num_features = sample->num_micro_features();
568  for (int f = 0; f < num_features; ++f)
569  MakeSample(clusterer, sample->micro_features()[f], sample_id);
570  ++sample_id;
571  }
572  *num_samples = sample_id;
573  return clusterer;
574 }
575 
576 // Writes the given float_classes (produced by SetupForFloat2Int) as inttemp
577 // to the given inttemp_file, and the corresponding pffmtable.
578 // The unicharset is the original encoding of graphemes, and shape_set should
579 // match the size of the shape_table, and may possibly be totally fake.
581  const UNICHARSET& shape_set,
582  const ShapeTable& shape_table,
583  CLASS_STRUCT* float_classes,
584  const char* inttemp_file,
585  const char* pffmtable_file) {
586  tesseract::Classify *classify = new tesseract::Classify();
587  // Move the fontinfo table to classify.
588  classify->get_fontinfo_table().move(&fontinfo_table_);
589  INT_TEMPLATES int_templates = classify->CreateIntTemplates(float_classes,
590  shape_set);
591  FILE* fp = fopen(inttemp_file, "wb");
592  classify->WriteIntTemplates(fp, int_templates, shape_set);
593  fclose(fp);
594  // Now write pffmtable. This is complicated by the fact that the adaptive
595  // classifier still wants one indexed by unichar-id, but the static
596  // classifier needs one indexed by its shape class id.
597  // We put the shapetable_cutoffs in a GenericVector, and compute the
598  // unicharset cutoffs along the way.
599  GenericVector<uinT16> shapetable_cutoffs;
600  GenericVector<uinT16> unichar_cutoffs;
601  for (int c = 0; c < unicharset.size(); ++c)
602  unichar_cutoffs.push_back(0);
603  /* then write out each class */
604  for (int i = 0; i < int_templates->NumClasses; ++i) {
605  INT_CLASS Class = ClassForClassId(int_templates, i);
606  // Todo: Test with min instead of max
607  // int MaxLength = LengthForConfigId(Class, 0);
608  uinT16 max_length = 0;
609  for (int config_id = 0; config_id < Class->NumConfigs; config_id++) {
610  // Todo: Test with min instead of max
611  // if (LengthForConfigId (Class, config_id) < MaxLength)
612  uinT16 length = Class->ConfigLengths[config_id];
613  if (length > max_length)
614  max_length = Class->ConfigLengths[config_id];
615  int shape_id = float_classes[i].font_set.get(config_id);
616  const Shape& shape = shape_table.GetShape(shape_id);
617  for (int c = 0; c < shape.size(); ++c) {
618  int unichar_id = shape[c].unichar_id;
619  if (length > unichar_cutoffs[unichar_id])
620  unichar_cutoffs[unichar_id] = length;
621  }
622  }
623  shapetable_cutoffs.push_back(max_length);
624  }
625  fp = fopen(pffmtable_file, "wb");
626  shapetable_cutoffs.Serialize(fp);
627  for (int c = 0; c < unicharset.size(); ++c) {
628  const char *unichar = unicharset.id_to_unichar(c);
629  if (strcmp(unichar, " ") == 0) {
630  unichar = "NULL";
631  }
632  fprintf(fp, "%s %d\n", unichar, unichar_cutoffs[c]);
633  }
634  fclose(fp);
635  free_int_templates(int_templates);
636 }
637 
638 // Generate debug output relating to the canonical distance between the
639 // two given UTF8 grapheme strings.
640 void MasterTrainer::DebugCanonical(const char* unichar_str1,
641  const char* unichar_str2) {
642  int class_id1 = unicharset_.unichar_to_id(unichar_str1);
643  int class_id2 = unicharset_.unichar_to_id(unichar_str2);
644  if (class_id2 == INVALID_UNICHAR_ID)
645  class_id2 = class_id1;
646  if (class_id1 == INVALID_UNICHAR_ID) {
647  tprintf("No unicharset entry found for %s\n", unichar_str1);
648  return;
649  } else {
650  tprintf("Font ambiguities for unichar %d = %s and %d = %s\n",
651  class_id1, unichar_str1, class_id2, unichar_str2);
652  }
653  int num_fonts = samples_.NumFonts();
654  const IntFeatureMap& feature_map = feature_map_;
655  // Iterate the fonts to get the similarity with other fonst of the same
656  // class.
657  tprintf(" ");
658  for (int f = 0; f < num_fonts; ++f) {
659  if (samples_.NumClassSamples(f, class_id2, false) == 0)
660  continue;
661  tprintf("%6d", f);
662  }
663  tprintf("\n");
664  for (int f1 = 0; f1 < num_fonts; ++f1) {
665  // Map the features of the canonical_sample.
666  if (samples_.NumClassSamples(f1, class_id1, false) == 0)
667  continue;
668  tprintf("%4d ", f1);
669  for (int f2 = 0; f2 < num_fonts; ++f2) {
670  if (samples_.NumClassSamples(f2, class_id2, false) == 0)
671  continue;
672  float dist = samples_.ClusterDistance(f1, class_id1, f2, class_id2,
673  feature_map);
674  tprintf(" %5.3f", dist);
675  }
676  tprintf("\n");
677  }
678  // Build a fake ShapeTable containing all the sample types.
679  ShapeTable shapes(unicharset_);
680  for (int f = 0; f < num_fonts; ++f) {
681  if (samples_.NumClassSamples(f, class_id1, true) > 0)
682  shapes.AddShape(class_id1, f);
683  if (class_id1 != class_id2 &&
684  samples_.NumClassSamples(f, class_id2, true) > 0)
685  shapes.AddShape(class_id2, f);
686  }
687 }
688 
689 #ifndef GRAPHICS_DISABLED
690 // Debugging for cloud/canonical features.
691 // Displays a Features window containing:
692 // If unichar_str2 is in the unicharset, and canonical_font is non-negative,
693 // displays the canonical features of the char/font combination in red.
694 // If unichar_str1 is in the unicharset, and cloud_font is non-negative,
695 // displays the cloud feature of the char/font combination in green.
696 // The canonical features are drawn first to show which ones have no
697 // matches in the cloud features.
698 // Until the features window is destroyed, each click in the features window
699 // will display the samples that have that feature in a separate window.
700 void MasterTrainer::DisplaySamples(const char* unichar_str1, int cloud_font,
701  const char* unichar_str2,
702  int canonical_font) {
703  const IntFeatureMap& feature_map = feature_map_;
704  const IntFeatureSpace& feature_space = feature_map.feature_space();
705  ScrollView* f_window = CreateFeatureSpaceWindow("Features", 100, 500);
707  f_window);
708  int class_id2 = samples_.unicharset().unichar_to_id(unichar_str2);
709  if (class_id2 != INVALID_UNICHAR_ID && canonical_font >= 0) {
710  const TrainingSample* sample = samples_.GetCanonicalSample(canonical_font,
711  class_id2);
712  for (int f = 0; f < sample->num_features(); ++f) {
713  RenderIntFeature(f_window, &sample->features()[f], ScrollView::RED);
714  }
715  }
716  int class_id1 = samples_.unicharset().unichar_to_id(unichar_str1);
717  if (class_id1 != INVALID_UNICHAR_ID && cloud_font >= 0) {
718  const BitVector& cloud = samples_.GetCloudFeatures(cloud_font, class_id1);
719  for (int f = 0; f < cloud.size(); ++f) {
720  if (cloud[f]) {
721  INT_FEATURE_STRUCT feature =
722  feature_map.InverseIndexFeature(f);
723  RenderIntFeature(f_window, &feature, ScrollView::GREEN);
724  }
725  }
726  }
727  f_window->Update();
728  ScrollView* s_window = CreateFeatureSpaceWindow("Samples", 100, 500);
729  SVEventType ev_type;
730  do {
731  SVEvent* ev;
732  // Wait until a click or popup event.
733  ev = f_window->AwaitEvent(SVET_ANY);
734  ev_type = ev->type;
735  if (ev_type == SVET_CLICK) {
736  int feature_index = feature_space.XYToFeatureIndex(ev->x, ev->y);
737  if (feature_index >= 0) {
738  // Iterate samples and display those with the feature.
739  Shape shape;
740  shape.AddToShape(class_id1, cloud_font);
741  s_window->Clear();
742  samples_.DisplaySamplesWithFeature(feature_index, shape,
743  feature_space, ScrollView::GREEN,
744  s_window);
745  s_window->Update();
746  }
747  }
748  delete ev;
749  } while (ev_type != SVET_DESTROY);
750 }
751 #endif // GRAPHICS_DISABLED
752 
753 // Tests the given test_classifier on the internal samples.
754 // See TestClassifier for details.
756  bool replicate_samples,
757  ShapeClassifier* test_classifier,
758  STRING* report_string) {
759  TestClassifier(report_level, replicate_samples, &samples_,
760  test_classifier, report_string);
761 }
762 
763 // Tests the given test_classifier on the given samples
764 // report_levels:
765 // 0 = no output.
766 // 1 = bottom-line error rate.
767 // 2 = bottom-line error rate + time.
768 // 3 = font-level error rate + time.
769 // 4 = list of all errors + short classifier debug output on 16 errors.
770 // 5 = list of all errors + short classifier debug output on 25 errors.
771 // If replicate_samples is true, then the test is run on an extended test
772 // sample including replicated and systematically perturbed samples.
773 // If report_string is non-NULL, a summary of the results for each font
774 // is appended to the report_string.
775 double MasterTrainer::TestClassifier(int report_level,
776  bool replicate_samples,
777  TrainingSampleSet* samples,
778  ShapeClassifier* test_classifier,
779  STRING* report_string) {
780  SampleIterator sample_it;
781  sample_it.Init(NULL, test_classifier->GetShapeTable(), replicate_samples,
782  samples);
783  if (report_level > 0) {
784  int num_samples = 0;
785  for (sample_it.Begin(); !sample_it.AtEnd(); sample_it.Next())
786  ++num_samples;
787  tprintf("Iterator has charset size of %d/%d, %d shapes, %d samples\n",
788  sample_it.SparseCharsetSize(), sample_it.CompactCharsetSize(),
789  test_classifier->GetShapeTable()->NumShapes(), num_samples);
790  tprintf("Testing %sREPLICATED:\n", replicate_samples ? "" : "NON-");
791  }
792  double unichar_error = 0.0;
793  ErrorCounter::ComputeErrorRate(test_classifier, report_level,
794  CT_SHAPE_TOP_ERR, fontinfo_table_,
795  page_images_, &sample_it, &unichar_error,
796  NULL, report_string);
797  return unichar_error;
798 }
799 
800 // Returns the average (in some sense) distance between the two given
801 // shapes, which may contain multiple fonts and/or unichars.
802 float MasterTrainer::ShapeDistance(const ShapeTable& shapes, int s1, int s2) {
803  const IntFeatureMap& feature_map = feature_map_;
804  const Shape& shape1 = shapes.GetShape(s1);
805  const Shape& shape2 = shapes.GetShape(s2);
806  int num_chars1 = shape1.size();
807  int num_chars2 = shape2.size();
808  float dist_sum = 0.0f;
809  int dist_count = 0;
810  if (num_chars1 > 1 || num_chars2 > 1) {
811  // In the multi-char case try to optimize the calculation by computing
812  // distances between characters of matching font where possible.
813  for (int c1 = 0; c1 < num_chars1; ++c1) {
814  for (int c2 = 0; c2 < num_chars2; ++c2) {
815  dist_sum += samples_.UnicharDistance(shape1[c1], shape2[c2],
816  true, feature_map);
817  ++dist_count;
818  }
819  }
820  } else {
821  // In the single unichar case, there is little alternative, but to compute
822  // the squared-order distance between pairs of fonts.
823  dist_sum = samples_.UnicharDistance(shape1[0], shape2[0],
824  false, feature_map);
825  ++dist_count;
826  }
827  return dist_sum / dist_count;
828 }
829 
830 // Replaces samples that are always fragmented with the corresponding
831 // fragment samples.
832 void MasterTrainer::ReplaceFragmentedSamples() {
833  if (fragments_ == NULL) return;
834  // Remove samples that are replaced by fragments. Each class that was
835  // always naturally fragmented should be replaced by its fragments.
836  int num_samples = samples_.num_samples();
837  for (int s = 0; s < num_samples; ++s) {
838  TrainingSample* sample = samples_.mutable_sample(s);
839  if (fragments_[sample->class_id()] > 0)
840  samples_.KillSample(sample);
841  }
842  samples_.DeleteDeadSamples();
843 
844  // Get ids of fragments in junk_samples_ that replace the dead chars.
845  const UNICHARSET& frag_set = junk_samples_.unicharset();
846 #if 0
847  // TODO(rays) The original idea was to replace only graphemes that were
848  // always naturally fragmented, but that left a lot of the Indic graphemes
849  // out. Determine whether we can go back to that idea now that spacing
850  // is fixed in the training images, or whether this code is obsolete.
851  bool* good_junk = new bool[frag_set.size()];
852  memset(good_junk, 0, sizeof(*good_junk) * frag_set.size());
853  for (int dead_ch = 1; dead_ch < unicharset_.size(); ++dead_ch) {
854  int frag_ch = fragments_[dead_ch];
855  if (frag_ch <= 0) continue;
856  const char* frag_utf8 = frag_set.id_to_unichar(frag_ch);
858  // Mark the chars for all parts of the fragment as good in good_junk.
859  for (int part = 0; part < frag->get_total(); ++part) {
860  frag->set_pos(part);
861  int good_ch = frag_set.unichar_to_id(frag->to_string().string());
862  if (good_ch != INVALID_UNICHAR_ID)
863  good_junk[good_ch] = true; // We want this one.
864  }
865  }
866 #endif
867  // For now just use all the junk that was from natural fragments.
868  // Get samples of fragments in junk_samples_ that replace the dead chars.
869  int num_junks = junk_samples_.num_samples();
870  for (int s = 0; s < num_junks; ++s) {
871  TrainingSample* sample = junk_samples_.mutable_sample(s);
872  int junk_id = sample->class_id();
873  const char* frag_utf8 = frag_set.id_to_unichar(junk_id);
875  if (frag != NULL && frag->is_natural()) {
876  junk_samples_.extract_sample(s);
877  samples_.AddSample(frag_set.id_to_unichar(junk_id), sample);
878  }
879  }
880  junk_samples_.DeleteDeadSamples();
881  junk_samples_.OrganizeByFontAndClass();
882  samples_.OrganizeByFontAndClass();
883  unicharset_.clear();
884  unicharset_.AppendOtherUnicharset(samples_.unicharset());
885  // delete [] good_junk;
886  // Fragments_ no longer needed?
887  delete [] fragments_;
888  fragments_ = NULL;
889 }
890 
891 // Runs a hierarchical agglomerative clustering to merge shapes in the given
892 // shape_table, while satisfying the given constraints:
893 // * End with at least min_shapes left in shape_table,
894 // * No shape shall have more than max_shape_unichars in it,
895 // * Don't merge shapes where the distance between them exceeds max_dist.
896 const float kInfiniteDist = 999.0f;
897 void MasterTrainer::ClusterShapes(int min_shapes, int max_shape_unichars,
898  float max_dist, ShapeTable* shapes) {
899  int num_shapes = shapes->NumShapes();
900  int max_merges = num_shapes - min_shapes;
901  GenericVector<ShapeDist>* shape_dists =
902  new GenericVector<ShapeDist>[num_shapes];
903  float min_dist = kInfiniteDist;
904  int min_s1 = 0;
905  int min_s2 = 0;
906  tprintf("Computing shape distances...");
907  for (int s1 = 0; s1 < num_shapes; ++s1) {
908  for (int s2 = s1 + 1; s2 < num_shapes; ++s2) {
909  ShapeDist dist(s1, s2, ShapeDistance(*shapes, s1, s2));
910  shape_dists[s1].push_back(dist);
911  if (dist.distance < min_dist) {
912  min_dist = dist.distance;
913  min_s1 = s1;
914  min_s2 = s2;
915  }
916  }
917  tprintf(" %d", s1);
918  }
919  tprintf("\n");
920  int num_merged = 0;
921  while (num_merged < max_merges && min_dist < max_dist) {
922  tprintf("Distance = %f: ", min_dist);
923  int num_unichars = shapes->MergedUnicharCount(min_s1, min_s2);
924  shape_dists[min_s1][min_s2 - min_s1 - 1].distance = kInfiniteDist;
925  if (num_unichars > max_shape_unichars) {
926  tprintf("Merge of %d and %d with %d would exceed max of %d unichars\n",
927  min_s1, min_s2, num_unichars, max_shape_unichars);
928  } else {
929  shapes->MergeShapes(min_s1, min_s2);
930  shape_dists[min_s2].clear();
931  ++num_merged;
932 
933  for (int s = 0; s < min_s1; ++s) {
934  if (!shape_dists[s].empty()) {
935  shape_dists[s][min_s1 - s - 1].distance =
936  ShapeDistance(*shapes, s, min_s1);
937  shape_dists[s][min_s2 - s -1].distance = kInfiniteDist;
938  }
939  }
940  for (int s2 = min_s1 + 1; s2 < num_shapes; ++s2) {
941  if (shape_dists[min_s1][s2 - min_s1 - 1].distance < kInfiniteDist)
942  shape_dists[min_s1][s2 - min_s1 - 1].distance =
943  ShapeDistance(*shapes, min_s1, s2);
944  }
945  for (int s = min_s1 + 1; s < min_s2; ++s) {
946  if (!shape_dists[s].empty()) {
947  shape_dists[s][min_s2 - s - 1].distance = kInfiniteDist;
948  }
949  }
950  }
951  min_dist = kInfiniteDist;
952  for (int s1 = 0; s1 < num_shapes; ++s1) {
953  for (int i = 0; i < shape_dists[s1].size(); ++i) {
954  if (shape_dists[s1][i].distance < min_dist) {
955  min_dist = shape_dists[s1][i].distance;
956  min_s1 = s1;
957  min_s2 = s1 + 1 + i;
958  }
959  }
960  }
961  }
962  tprintf("Stopped with %d merged, min dist %f\n", num_merged, min_dist);
963  delete [] shape_dists;
964  if (debug_level_ > 1) {
965  for (int s1 = 0; s1 < num_shapes; ++s1) {
966  if (shapes->MasterDestinationIndex(s1) == s1) {
967  tprintf("Master shape:%s\n", shapes->DebugStr(s1).string());
968  }
969  }
970  }
971 }
972 
973 
974 } // namespace tesseract.