Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
segsearch.cpp
Go to the documentation of this file.
1 
2 // File: segsearch.h
3 // Description: Segmentation search functions.
4 // Author: Daria Antonova
5 // Created: Mon Jun 23 11:26:43 PDT 2008
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include "wordrec.h"
21 
22 #include "associate.h"
23 #include "baseline.h"
24 #include "language_model.h"
25 #include "matrix.h"
26 #include "oldheap.h"
27 #include "params.h"
28 #include "ratngs.h"
29 #include "states.h"
30 
32 
33 namespace tesseract {
34 
35 void Wordrec::SegSearch(CHUNKS_RECORD *chunks_record,
36  WERD_CHOICE *best_choice,
37  BLOB_CHOICE_LIST_VECTOR *best_char_choices,
38  WERD_CHOICE *raw_choice,
39  STATE *output_best_state,
40  BlamerBundle *blamer_bundle) {
41  int row, col = 0;
42  if (segsearch_debug_level > 0) {
43  tprintf("Starting SegSearch on ratings matrix:\n");
44  chunks_record->ratings->print(getDict().getUnicharset());
45  }
46  // Start with a fresh best_choice since rating adjustments
47  // used by the chopper and the new segmentation search are not compatible.
48  best_choice->set_rating(WERD_CHOICE::kBadRating);
49  // TODO(antonova): Due to the fact that we currently do not re-start the
50  // segmentation search from the best choice the chopper found, sometimes
51  // the the segmentation search does not find the best path (that chopper
52  // did discover) and does not have a chance to adapt to it. As soon as we
53  // transition to using new-style language model penalties in the chopper
54  // this issue will be resolved. But for how we are forced clear the
55  // accumulator choices.
56  //
57  // Clear best choice accumulator (that is used for adaption), so that
58  // choices adjusted by chopper do not interfere with the results from the
59  // segmentation search.
61 
62  MATRIX *ratings = chunks_record->ratings;
63  // Priority queue containing pain points generated by the language model
64  // The priority is set by the language model components, adjustments like
65  // seam cost and width priority are factored into the priority.
66  HEAP *pain_points = MakeHeap(segsearch_max_pain_points);
67 
68  // best_path_by_column records the lowest cost path found so far for each
69  // column of the chunks_record->ratings matrix over all the rows.
70  BestPathByColumn *best_path_by_column =
71  new BestPathByColumn[ratings->dimension()];
72  for (col = 0; col < ratings->dimension(); ++col) {
73  best_path_by_column[col].avg_cost = WERD_CHOICE::kBadRating;
74  best_path_by_column[col].best_vse = NULL;
75  }
76 
77  // Compute scaling factor that will help us recover blob outline length
78  // from classifier rating and certainty for the blob.
79  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
80 
83  best_choice->certainty(),
84  segsearch_max_char_wh_ratio, rating_cert_scale,
85  pain_points, chunks_record, blamer_bundle,
87 
88  MATRIX_COORD *pain_point;
89  float pain_point_priority;
90  BestChoiceBundle best_choice_bundle(
91  output_best_state, best_choice, raw_choice, best_char_choices);
92 
93  // pending[i] stores a list of the parent/child pair of BLOB_CHOICE_LISTs,
94  // where i is the column of the child. Initially all the classified entries
95  // in the ratings matrix from column 0 (with parent NULL) are inserted into
96  // pending[0]. As the language model state is updated, new child/parent
97  // pairs are inserted into the lists. Next, the entries in pending[1] are
98  // considered, and so on. It is important that during the update the
99  // children are considered in the non-decreasing order of their column, since
100  // this guarantees that all the parents would be up to date before an update
101  // of a child is done.
102  SEG_SEARCH_PENDING_LIST *pending =
103  new SEG_SEARCH_PENDING_LIST[ratings->dimension()];
104 
105  // Search for the ratings matrix for the initial best path.
106  for (row = 0; row < ratings->dimension(); ++row) {
107  if (ratings->get(0, row) != NOT_CLASSIFIED) {
108  pending[0].add_sorted(
111  }
112  }
113  UpdateSegSearchNodes(0, &pending, &best_path_by_column, chunks_record,
114  pain_points, &best_choice_bundle, blamer_bundle);
115 
116  // Keep trying to find a better path by fixing the "pain points".
117  int num_futile_classifications = 0;
118  STRING blamer_debug;
119  while (!SegSearchDone(num_futile_classifications) ||
120  (blamer_bundle != NULL &&
121  blamer_bundle->segsearch_is_looking_for_blame)) {
122  // Get the next valid "pain point".
123  int pop;
124  while (true) {
125  pop = HeapPop(pain_points, &pain_point_priority, &pain_point);
126  if (pop == EMPTY) break;
127  if (pain_point->Valid(*ratings) &&
128  ratings->get(pain_point->col, pain_point->row) == NOT_CLASSIFIED) {
129  break;
130  } else {
131  delete pain_point;
132  }
133  }
134  if (pop == EMPTY) {
135  if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
136  break;
137  }
138  ProcessSegSearchPainPoint(pain_point_priority, *pain_point,
139  best_choice_bundle.best_choice, &pending,
140  chunks_record, pain_points, blamer_bundle);
141 
142  UpdateSegSearchNodes(pain_point->col, &pending, &best_path_by_column,
143  chunks_record, pain_points, &best_choice_bundle,
144  blamer_bundle);
145  if (!best_choice_bundle.updated) ++num_futile_classifications;
146 
147  if (segsearch_debug_level > 0) {
148  tprintf("num_futile_classifications %d\n", num_futile_classifications);
149  }
150 
151  best_choice_bundle.updated = false; // reset updated
152  delete pain_point; // done using this pain point
153 
154  // See if it's time to terminate SegSearch or time for starting a guided
155  // search for the true path to find the blame for the incorrect best_choice.
156  if (SegSearchDone(num_futile_classifications) && blamer_bundle != NULL &&
157  blamer_bundle->incorrect_result_reason == IRR_CORRECT &&
158  !blamer_bundle->segsearch_is_looking_for_blame &&
159  blamer_bundle->truth_has_char_boxes &&
160  !ChoiceIsCorrect(getDict().getUnicharset(),
161  best_choice, blamer_bundle->truth_text)) {
162  InitBlamerForSegSearch(best_choice_bundle.best_choice, chunks_record,
163  pain_points, blamer_bundle, &blamer_debug);
164  }
165  } // end while loop exploring alternative paths
166  FinishBlamerForSegSearch(best_choice_bundle.best_choice,
167  blamer_bundle, &blamer_debug);
168 
169  if (segsearch_debug_level > 0) {
170  tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
172  }
173 
174  // Clean up.
175  FreeHeapData(pain_points, MATRIX_COORD::Delete);
176  delete[] best_path_by_column;
177  delete[] pending;
178  for (row = 0; row < ratings->dimension(); ++row) {
179  for (col = 0; col <= row; ++col) {
180  BLOB_CHOICE_LIST *rating = ratings->get(col, row);
181  if (rating != NOT_CLASSIFIED) language_model_->DeleteState(rating);
182  }
183  }
184 }
185 
187  int starting_col,
188  SEG_SEARCH_PENDING_LIST *pending[],
189  BestPathByColumn *best_path_by_column[],
190  CHUNKS_RECORD *chunks_record,
191  HEAP *pain_points,
192  BestChoiceBundle *best_choice_bundle,
193  BlamerBundle *blamer_bundle) {
194  MATRIX *ratings = chunks_record->ratings;
195  for (int col = starting_col; col < ratings->dimension(); ++col) {
196  if (segsearch_debug_level > 0) {
197  tprintf("\n\nUpdateSegSearchNodes: evaluate children in col=%d\n", col);
198  }
199  // Iterate over the pending list for this column.
200  SEG_SEARCH_PENDING_LIST *pending_list = &((*pending)[col]);
201  SEG_SEARCH_PENDING_IT pending_it(pending_list);
202  GenericVector<int> non_empty_rows;
203  while (!pending_it.empty()) {
204  // Update language model state of this child+parent pair.
205  SEG_SEARCH_PENDING *p = pending_it.extract();
206  if (non_empty_rows.length() == 0 ||
207  non_empty_rows[non_empty_rows.length()-1] != p->child_row) {
208  non_empty_rows.push_back(p->child_row);
209  }
210  BLOB_CHOICE_LIST *current_node = ratings->get(col, p->child_row);
211  LanguageModelFlagsType new_changed =
213  current_node, p->parent, pain_points,
214  best_path_by_column, chunks_record,
215  best_choice_bundle, blamer_bundle);
216  if (new_changed) {
217  // Since the language model state of this entry changed, add all the
218  // pairs with it as a parent and each of its children to pending, so
219  // that the children are updated as well.
220  int child_col = p->child_row + 1;
221  for (int child_row = child_col;
222  child_row < ratings->dimension(); ++child_row) {
223  if (ratings->get(child_col, child_row) != NOT_CLASSIFIED) {
224  SEG_SEARCH_PENDING *new_pending =
225  new SEG_SEARCH_PENDING(child_row, current_node, 0);
226  SEG_SEARCH_PENDING *actual_new_pending =
227  reinterpret_cast<SEG_SEARCH_PENDING *>(
228  (*pending)[child_col].add_sorted_and_find(
229  SEG_SEARCH_PENDING::compare, true, new_pending));
230  if (new_pending != actual_new_pending) delete new_pending;
231  actual_new_pending->changed |= new_changed;
232  if (segsearch_debug_level > 0) {
233  tprintf("Added child(col=%d row=%d) parent(col=%d row=%d)"
234  " changed=0x%x to pending\n", child_col,
235  actual_new_pending->child_row,
236  col, p->child_row, actual_new_pending->changed);
237  }
238  }
239  }
240  } // end if new_changed
241  delete p; // clean up
242  pending_it.forward();
243  } // end while !pending_it.empty()
245  col, non_empty_rows, best_choice_bundle->best_choice->certainty(),
246  pain_points, best_path_by_column, chunks_record);
247  } // end for col
248 
249  if (best_choice_bundle->updated) {
251  pain_points, chunks_record, best_choice_bundle);
252  }
253 
255 }
256 
257 void Wordrec::ProcessSegSearchPainPoint(float pain_point_priority,
258  const MATRIX_COORD &pain_point,
259  const WERD_CHOICE *best_choice,
260  SEG_SEARCH_PENDING_LIST *pending[],
261  CHUNKS_RECORD *chunks_record,
262  HEAP *pain_points,
263  BlamerBundle *blamer_bundle) {
264  if (segsearch_debug_level > 0) {
265  tprintf("Classifying pain point priority=%.4f, col=%d, row=%d\n",
266  pain_point_priority, pain_point.col, pain_point.row);
267  }
268  MATRIX *ratings = chunks_record->ratings;
269  BLOB_CHOICE_LIST *classified = classify_piece(
270  chunks_record->chunks, chunks_record->word_res->denorm,
271  chunks_record->splits,
272  pain_point.col, pain_point.row, blamer_bundle);
273  ratings->put(pain_point.col, pain_point.row, classified);
274 
275  if (segsearch_debug_level > 0) {
276  print_ratings_list("Updated ratings matrix with a new entry:",
277  ratings->get(pain_point.col, pain_point.row),
278  getDict().getUnicharset());
279  ratings->print(getDict().getUnicharset());
280  }
281 
282  // Insert initial "pain points" to join the newly classified blob
283  // with its left and right neighbors.
284  if (!classified->empty()) {
285  float worst_piece_cert;
286  bool fragmented;
287  if (pain_point.col > 0) {
289  pain_point.col-1, pain_point.row, chunks_record->ratings,
290  &worst_piece_cert, &fragmented);
292  pain_point.col-1, pain_point.row, false,
294  worst_piece_cert, fragmented, best_choice->certainty(),
296  chunks_record, pain_points);
297  }
298  if (pain_point.row+1 < ratings->dimension()) {
300  pain_point.col, pain_point.row+1, chunks_record->ratings,
301  &worst_piece_cert, &fragmented);
303  pain_point.col, pain_point.row+1, true,
305  worst_piece_cert, fragmented, best_choice->certainty(),
307  chunks_record, pain_points);
308  }
309  }
310 
311  // Record a pending entry with the pain_point and each of its parents.
312  int parent_row = pain_point.col - 1;
313  if (parent_row < 0) { // this node has no parents
314  (*pending)[pain_point.col].add_sorted(
316  new SEG_SEARCH_PENDING(pain_point.row, NULL,
318  } else {
319  for (int parent_col = 0; parent_col < pain_point.col; ++parent_col) {
320  if (ratings->get(parent_col, parent_row) != NOT_CLASSIFIED) {
321  (*pending)[pain_point.col].add_sorted(
323  new SEG_SEARCH_PENDING(pain_point.row,
324  ratings->get(parent_col, parent_row),
326  }
327  }
328  }
329 }
330 
332  CHUNKS_RECORD *chunks_record,
333  HEAP *pain_points,
334  BlamerBundle *blamer_bundle,
335  STRING *blamer_debug) {
336  blamer_bundle->segsearch_is_looking_for_blame = true;
337  if (wordrec_debug_blamer) {
338  tprintf("segsearch starting to look for blame\n");
339  }
340  // Clear pain points heap.
341  int pop;
342  float pain_point_priority;
343  MATRIX_COORD *pain_point;
344  while ((pop = HeapPop(pain_points, &pain_point_priority,
345  &pain_point)) != EMPTY) {
346  delete pain_point;
347  }
348  // Fill pain points for any unclassifed blob corresponding to the
349  // correct segmentation state.
350  *blamer_debug += "Correct segmentation:\n";
351  for (int idx = 0;
352  idx < blamer_bundle->correct_segmentation_cols.length(); ++idx) {
353  blamer_debug->add_str_int(
354  "col=", blamer_bundle->correct_segmentation_cols[idx]);
355  blamer_debug->add_str_int(
356  " row=", blamer_bundle->correct_segmentation_rows[idx]);
357  *blamer_debug += "\n";
358  if (chunks_record->ratings->get(
359  blamer_bundle->correct_segmentation_cols[idx],
360  blamer_bundle->correct_segmentation_rows[idx]) == NOT_CLASSIFIED) {
362  blamer_bundle->correct_segmentation_cols[idx],
363  blamer_bundle->correct_segmentation_rows[idx],
364  false, -1.0, -1.0, false, -1.0, segsearch_max_char_wh_ratio,
365  NULL, NULL, chunks_record, pain_points)) {
366  blamer_bundle->segsearch_is_looking_for_blame = false;
367  *blamer_debug += "\nFailed to insert pain point\n";
368  blamer_bundle->SetBlame(IRR_SEGSEARCH_HEUR, *blamer_debug, best_choice,
370  break;
371  }
372  }
373  } // end for blamer_bundle->correct_segmentation_cols/rows
374 }
375 
377  BlamerBundle *blamer_bundle,
378  STRING *blamer_debug) {
379  // If we are still looking for blame (i.e. best_choice is incorrect, but a
380  // path representing the correct segmentation could be constructed), we can
381  // blame segmentation search pain point prioritization if the rating of the
382  // path corresponding to the correct segmentation is better than that of
383  // best_choice (i.e. language model would have done the correct thing, but
384  // because of poor pain point prioritization the correct segmentation was
385  // never explored). Otherwise we blame the tradeoff between the language model
386  // and the classifier, since even after exploring the path corresponding to
387  // the correct segmentation incorrect best_choice would have been chosen.
388  // One special case when we blame the classifier instead is when best choice
389  // is incorrect, but it is a dictionary word and it classifier's top choice.
390  if (blamer_bundle != NULL && blamer_bundle->segsearch_is_looking_for_blame) {
391  blamer_bundle->segsearch_is_looking_for_blame = false;
392  if (blamer_bundle->best_choice_is_dict_and_top_choice) {
393  *blamer_debug = "Best choice is: incorrect, top choice, dictionary word";
394  *blamer_debug += " with permuter ";
395  *blamer_debug += best_choice->permuter_name();
396  blamer_bundle->SetBlame(IRR_CLASSIFIER, *blamer_debug, best_choice,
398  } else if (blamer_bundle->best_correctly_segmented_rating <
399  best_choice->rating()) {
400  *blamer_debug += "Correct segmentation state was not explored";
401  blamer_bundle->SetBlame(IRR_SEGSEARCH_PP, *blamer_debug, best_choice,
403  } else {
404  if (blamer_bundle->best_correctly_segmented_rating >=
406  *blamer_debug += "Correct segmentation paths were pruned by LM\n";
407  } else {
408  char debug_buffer[256];
409  *blamer_debug += "Best correct segmentation rating ";
410  sprintf(debug_buffer, "%g",
411  blamer_bundle->best_correctly_segmented_rating);
412  *blamer_debug += debug_buffer;
413  *blamer_debug += " vs. best choice rating ";
414  sprintf(debug_buffer, "%g", best_choice->rating());
415  *blamer_debug += debug_buffer;
416  }
417  blamer_bundle->SetBlame(IRR_CLASS_LM_TRADEOFF, *blamer_debug, best_choice,
419  }
420  }
421 }
422 
423 } // namespace tesseract