20 #define __func__ __FUNCTION__
// Returns four fifths of space_pix, computed in integer arithmetic: the
// product is formed first and then divided, so the result truncates toward
// zero.
//
// Callers in this file pass a row's average inter-word space (in pixels) and
// use the returned value as an alignment tolerance.
//
//   space_pix: width of a typical inter-word space.
//   returns:   space_pix * 4 / 5.
static int Epsilon(int space_pix) {
  return space_pix * 4 / 5;
}
68 static bool AcceptableRowArgs(
69 int debug_level,
int min_num_rows,
const char *function_name,
71 int row_start,
int row_end) {
72 if (row_start < 0 || row_end > rows->
size() || row_start > row_end) {
73 tprintf(
"Invalid arguments rows[%d, %d) while rows is of size %d.\n",
74 row_start, row_end, rows->
size());
77 if (row_end - row_start < min_num_rows) {
78 if (debug_level > 1) {
79 tprintf(
"# Too few rows[%d, %d) for %s.\n",
80 row_start, row_end, function_name);
90 static STRING StrOf(
int num) {
92 snprintf(buffer,
sizeof(buffer),
"%d", num);
101 for (
int r = 0; r < rows.
size(); r++) {
102 int num_columns = rows[r].
size();
103 for (
int c = 0; c < num_columns; c++) {
104 int num_unicodes = 0;
105 for (
int i = 0; i < rows[r][c].
size(); i++) {
106 if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++;
108 if (c >= max_col_widths.
size()) {
111 if (num_unicodes > max_col_widths[c])
112 max_col_widths[c] = num_unicodes;
118 for (
int c = 0; c < max_col_widths.
size(); c++) {
120 STRING(
"%-") + StrOf(max_col_widths[c]) +
"s");
123 for (
int r = 0; r < rows.
size(); r++) {
124 for (
int c = 0; c < rows[r].
size(); c++) {
127 tprintf(col_width_patterns[c].
string(), rows[r][c].
string());
140 static void PrintDetectorState(
const ParagraphTheory &theory,
144 output.
back().push_back(
"#row");
145 output.
back().push_back(
"space");
146 output.
back().push_back(
"..");
147 output.
back().push_back(
"lword[widthSEL]");
148 output.
back().push_back(
"rword[widthSEL]");
150 output.
back().push_back(
"text");
152 for (
int i = 0; i < rows.
size(); i++) {
155 const RowInfo& ri = *rows[i].ri_;
157 row.
push_back(StrOf(ri.average_interword_space));
158 row.
push_back(ri.has_leaders ?
".." :
" ");
160 "[" + StrOf(ri.lword_box.width()) +
161 (ri.lword_likely_starts_idea ?
"S" :
"s") +
162 (ri.lword_likely_ends_idea ?
"E" :
"e") +
163 (ri.lword_indicates_list_item ?
"L" :
"l") +
166 "[" + StrOf(ri.rword_box.width()) +
167 (ri.rword_likely_starts_idea ?
"S" :
"s") +
168 (ri.rword_likely_ends_idea ?
"E" :
"e") +
169 (ri.rword_indicates_list_item ?
"L" :
"l") +
171 rows[i].AppendDebugInfo(theory, &row);
174 PrintTable(output,
" ");
176 tprintf(
"Active Paragraph Models:\n");
177 for (
int m = 0; m < theory.models().size(); m++) {
178 tprintf(
" %d: %s\n", m + 1, theory.models()[m]->ToString().string());
182 static void DebugDump(
185 const ParagraphTheory &theory,
190 PrintDetectorState(theory, rows);
195 int row_start,
int row_end) {
196 tprintf(
"======================================\n");
197 for (
int row = row_start; row < row_end; row++) {
198 tprintf(
"%s\n", rows[row].ri_->text.string());
200 tprintf(
"======================================\n");
206 return (ch >=
'a' && ch <=
'z') || (ch >=
'A' && ch <=
'Z');
210 return ch ==
'o' || ch ==
'O' || ch ==
'l' || ch ==
'I';
214 return strchr(
"'\"({[", ch) !=
NULL;
218 return strchr(
":'\".?!]})", ch) !=
NULL;
222 const char *
SkipChars(
const char *str,
const char *toskip) {
223 while (*str !=
'\0' && strchr(toskip, *str)) { str++; }
227 const char *
SkipChars(
const char *str,
bool (*skip)(
int)) {
228 while (*str !=
'\0' && skip(*str)) { str++; }
232 const char *
SkipOne(
const char *str,
const char *toskip) {
233 if (*str !=
'\0' && strchr(toskip, *str))
return str + 1;
241 const char *kRomans =
"ivxlmdIVXLMD";
// Characters accepted as decimal digits when scanning a list numeral.
// All ten digits must be present: the previous value omitted '6', so any
// numeral containing a 6 was not matched as a digit run.
const char *kDigits = "0123456789";
243 const char *kOpen =
"[{(";
244 const char *kSep =
":;-.,";
245 const char *kClose =
"]})";
247 int num_segments = 0;
248 const char *pos = word.
string();
249 while (*pos !=
'\0' && num_segments < 3) {
252 const char *numeral_end =
SkipChars(numeral_start, kRomans);
253 if (numeral_end != numeral_start) {
256 numeral_end =
SkipChars(numeral_start, kDigits);
257 if (numeral_end == numeral_start) {
260 if (numeral_end - numeral_start != 1)
268 if (pos == numeral_end)
275 const char *kListMarks =
"0Oo*.,+.";
276 return word.
size() == 1 && strchr(kListMarks, word[0]) !=
NULL;
287 if (!u || !werd || pos > werd->
length())
297 : u_(unicharset), word_(word) { wordlen_ = word->
length(); }
315 while (pos < wordlen_ && u_->get_ispunctuation(word_->
unichar_id(pos))) pos++;
326 const char *kRomans =
"ivxlmdIVXLMD";
327 while (pos < wordlen_) {
329 if (ch >= 0xF0 || strchr(kRomans, ch) == 0)
break;
336 while (pos < wordlen_ && u_->get_isalpha(word_->
unichar_id(pos))) pos++;
374 int num_segments = 0;
376 while (pos < werd->length() && num_segments < 3) {
377 int numeral_start = m.
SkipPunc(pos);
378 if (numeral_start > pos + 1)
break;
379 int numeral_end = m.
SkipRomans(numeral_start);
380 if (numeral_end == numeral_start) {
382 if (numeral_end == numeral_start) {
384 numeral_end = m.
SkipAlpha(numeral_start);
385 if (numeral_end - numeral_start != 1)
393 if (pos == numeral_end)
396 return pos == werd->
length();
408 bool *is_list,
bool *starts_idea,
bool *ends_idea) {
410 *starts_idea =
false;
417 if (unicharset && werd) {
435 int start_letter = utf8[0];
442 if (start_letter >=
'A' && start_letter <=
'Z') {
455 bool *is_list,
bool *starts_idea,
bool *ends_idea) {
457 *starts_idea =
false;
464 if (unicharset && werd) {
478 int last_letter = utf8[utf8.
size() - 1];
489 header->
push_back(
"[lmarg,lind;rind,rmarg]");
496 snprintf(s,
sizeof(s),
"[%3d,%3d;%3d,%3d]",
503 int model_numbers = 0;
504 for (
int h = 0; h < hypotheses_.
size(); h++) {
505 if (hypotheses_[h].model ==
NULL)
507 if (model_numbers > 0)
510 model_string += StrOf(1 + theory.
IndexOf(hypotheses_[h].model));
511 }
else if (hypotheses_[h].model ==
kCrownLeft) {
512 model_string +=
"CrL";
514 model_string +=
"CrR";
518 if (model_numbers == 0)
533 if (hypotheses_.
empty())
535 bool has_start =
false;
536 bool has_body =
false;
537 for (
int i = 0; i < hypotheses_.
size(); i++) {
538 switch (hypotheses_[i].ty) {
539 case LT_START: has_start =
true;
break;
540 case LT_BODY: has_body =
true;
break;
542 tprintf(
"Encountered bad value in hypothesis list: %c\n",
547 if (has_start && has_body)
553 if (hypotheses_.
empty())
555 bool has_start =
false;
556 bool has_body =
false;
557 for (
int i = 0; i < hypotheses_.
size(); i++) {
558 if (hypotheses_[i].model != model)
560 switch (hypotheses_[i].ty) {
561 case LT_START: has_start =
true;
break;
562 case LT_BODY: has_body =
true;
break;
564 tprintf(
"Encountered bad value in hypothesis list: %c\n",
569 if (has_start && has_body)
577 tprintf(
"Trying to set a line to be START when it's already BODY.\n");
587 tprintf(
"Trying to set a line to be BODY when it's already START.\n");
598 hypotheses_.
remove(old_idx);
605 hypotheses_.
remove(old_idx);
609 for (
int h = 0; h < hypotheses_.
size(); h++) {
616 for (
int h = 0; h < hypotheses_.
size(); h++) {
623 for (
int h = 0; h < hypotheses_.
size(); h++) {
624 if (hypotheses_[h].model !=
NULL)
630 if (hypotheses_.
size() != 1 || hypotheses_[0].ty !=
LT_START)
632 return hypotheses_[0].model;
636 if (hypotheses_.
size() != 1 || hypotheses_[0].ty !=
LT_BODY)
638 return hypotheses_[0].model;
646 for (
int h = hypotheses_.
size() - 1; h >= 0; h--) {
647 if (!models.
contains(hypotheses_[h].model)) {
666 : max_cluster_width_(max_cluster_width) {}
672 int max_cluster_width_;
679 for (
int i = 0; i < clusters.
size(); i++) {
680 if (abs(value - clusters[i].center) <
681 abs(value - clusters[best_index].center))
690 for (
int i = 0; i < values_.
size();) {
694 while (++i < values_.
size() && values_[i] <= lo + max_cluster_width_) {
704 int row_start,
int row_end,
708 if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
715 for (
int i = row_start; i < row_end; i++) {
716 initial_lefts.
Add((*rows)[i].lindent_);
717 initial_rights.
Add((*rows)[i].rindent_);
730 int infrequent_enough_to_ignore = (row_end - row_start) /
kStrayLinePer;
731 for (
int i = row_start; i < row_end; i++) {
732 int lidx =
ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
733 int ridx =
ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
734 if (initial_left_tabs[lidx].
count > infrequent_enough_to_ignore ||
735 initial_right_tabs[ridx].
count > infrequent_enough_to_ignore) {
736 lefts.
Add((*rows)[i].lindent_);
737 rights.
Add((*rows)[i].rindent_);
764 int row_start,
int row_end,
768 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
770 for (
int row = row_start; row < row_end; row++) {
773 if (valid_first && !valid_body) {
774 (*rows)[row].AddStartLine(model);
775 }
else if (valid_body && !valid_first) {
776 (*rows)[row].AddBodyLine(model);
777 }
else if (valid_body && valid_first) {
778 bool after_eop = (row == row_start);
779 if (row > row_start) {
780 if (eop_threshold > 0) {
782 after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
784 after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
792 (*rows)[row].AddStartLine(model);
794 (*rows)[row].AddBodyLine(model);
813 int r_start,
int r_end)
814 : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end),
819 ltr = (*r)[r_start].ri_->ltr;
863 (*rows)[row_a], (*rows)[row_b],
just);
866 void PrintRows()
const { PrintRowRange(*rows, row_start, row_end); }
868 void Fail(
int min_debug_level,
const char *why)
const {
869 if (debug_level < min_debug_level)
return;
941 int num_full_rows = 0;
942 int last_row_full = 0;
946 if (i == s.
row_end - 1) last_row_full++;
950 if (num_full_rows < 0.7 * num_rows) {
951 s.
Fail(1,
"Not enough full lines to know which lines start paras.");
964 if (debug_level > 0) {
965 tprintf(
"# Not enough variety for clear outline classification. "
966 "Guessing these are %s aligned based on script.\n",
967 s.
ltr ?
"left" :
"right");
975 if (num_rows - 1 == num_full_rows - last_row_full) {
980 (*s.
rows)[i].AddBodyLine(model);
1030 int row_start,
int row_end,
1032 if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
1034 if (debug_level > 1) {
1035 tprintf(
"###############################################\n");
1036 tprintf(
"##### GeometricClassify( rows[%d:%d) ) ####\n",
1037 row_start, row_end);
1038 tprintf(
"###############################################\n");
1044 s.
Fail(2,
"Too much variety for simple outline classification.");
1048 s.
Fail(1,
"Not enough variety for simple outline classification.");
1077 int firsts[2] = {0, 0};
1095 int percent0firsts, percent1firsts;
1096 percent0firsts = (100 * firsts[0]) / s.
AlignTabs()[0].count;
1097 percent1firsts = (100 * firsts[1]) / s.
AlignTabs()[1].count;
1100 if ((percent0firsts < 20 && 30 < percent1firsts) ||
1101 percent0firsts + 30 < percent1firsts) {
1104 }
else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1105 percent1firsts + 30 < percent0firsts) {
1110 if (debug_level > 1) {
1111 tprintf(
"# Cannot determine %s indent likely to start paragraphs.\n",
1113 tprintf(
"# Indent of %d looks like a first line %d%% of the time.\n",
1114 s.
AlignTabs()[0].center, percent0firsts);
1115 tprintf(
"# Indent of %d looks like a first line %d%% of the time.\n",
1116 s.
AlignTabs()[1].center, percent1firsts);
1164 for (
int i = 0; i < models_->
size(); i++) {
1165 if ((*models_)[i]->Comparable(model))
1166 return (*models_)[i];
1175 for (
int i = models_->
size() - 1; i >= 0; i--) {
1190 for (
int m = 0; m < models_->
size(); m++) {
1200 for (
int m = 0; m < models_->
size(); m++) {
1208 for (
int i = 0; i < models_->
size(); i++) {
1209 if ((*models_)[i] == model)
1218 tprintf(
"ValidFirstLine() should only be called with strong models!\n");
1222 (*rows)[row].lmargin_, (*rows)[row].lindent_,
1223 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1229 tprintf(
"ValidBodyLine() should only be called with strong models!\n");
1233 (*rows)[row].lmargin_, (*rows)[row].lindent_,
1234 (*rows)[row].rindent_, (*rows)[row].rmargin_);
1240 tprintf(
"CrownCompatible() should only be called with crown models!\n");
1261 : theory_(theory), rows_(rows), row_start_(row_start),
1263 if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
1269 for (
int row = row_start - 1; row <= row_end; row++) {
1275 void ParagraphModelSmearer::CalculateOpenModels(
int row_start,
int row_end) {
1277 if (row_start < row_start_) row_start = row_start_;
1278 if (row_end > row_end_) row_end = row_end_;
1280 for (
int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end;
1282 if ((*rows_)[row].ri_->num_words == 0) {
1283 OpenModels(row + 1) = no_models;
1286 (*rows_)[row].StartHypotheses(&opened);
1290 for (
int m = 0; m < opened.size(); m++) {
1296 still_open.push_back_new(opened[m]);
1299 OpenModels(row + 1) = still_open;
1306 CalculateOpenModels(row_start_, row_end_);
1311 for (
int i = row_start_; i < row_end_; i++) {
1320 bool left_align_open =
false;
1321 bool right_align_open =
false;
1322 for (
int m = 0; m < OpenModels(i).
size(); m++) {
1323 switch (OpenModels(i)[m]->justification()) {
1326 default: left_align_open = right_align_open =
true;
1334 likely_start =
true;
1336 if ((left_align_open && right_align_open) ||
1337 (!left_align_open && !right_align_open)) {
1342 }
else if (left_align_open) {
1357 for (
int m = 0; m < OpenModels(i).
size(); m++) {
1366 (*rows_)[i - 1].StrongHypotheses(&last_line_models);
1370 for (
int m = 0; m < last_line_models.
size(); m++) {
1385 for (
int m = 0; m < all_models.
size(); m++) {
1395 CalculateOpenModels(i + 1, row_end_);
1407 for (
int i = 0; i < rows.
size(); i++) {
1408 rows[i].StrongHypotheses(&used_models);
1441 for (
int end = rows->
size(); end > 0; end = start) {
1445 (model = (*rows)[end - 1].UniqueBodyHypothesis()) ==
NULL) {
1448 if (end == 0)
break;
1450 while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1453 if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
1479 (*rows)[start].SetUnknown();
1480 (*rows)[start].AddStartLine(crown_model);
1481 for (
int row = start + 1; row < end; row++) {
1482 (*rows)[row].SetUnknown();
1483 (*rows)[row].AddBodyLine(crown_model);
1510 if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
1513 int lmin, lmax, rmin, rmax;
1514 lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1515 rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1516 for (
int i = start; i < end; i++) {
1524 STATS lefts(lmin, lmax + 1);
1525 STATS rights(rmin, rmax + 1);
1526 for (
int i = start; i < end; i++) {
1533 int ignorable_left = lefts.
ile(
ClipToRange(percentile, 0, 100) / 100.0);
1534 int ignorable_right = rights.
ile(
ClipToRange(percentile, 0, 100) / 100.0);
1535 for (
int i = start; i < end; i++) {
1537 int ldelta = ignorable_left - sr.
lmargin_;
1540 int rdelta = ignorable_right - sr.
rmargin_;
1548 int row_start,
int row_end) {
1549 if (row_end < row_start + 1)
return 1;
1551 int natural_space = rows[row_start].ri_->average_interword_space;
1552 for (
int i = row_start; i < row_end; i++) {
1553 if (rows[i].ri_->num_words > 1) {
1555 natural_space = rows[i].ri_->average_interword_space;
1558 if (rows[i].ri_->average_interword_space < natural_space)
1559 natural_space = rows[i].ri_->average_interword_space;
1563 return natural_space;
1575 tprintf(
"Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1577 int available_space;
1598 int available_space = before.
lindent_;
1599 if (before.
rindent_ > available_space)
1641 int start,
int end,
int tolerance,
bool *consistent) {
1642 int ltr_line_count = 0;
1643 for (
int i = start; i < end; i++) {
1644 ltr_line_count +=
static_cast<int>((*rows)[i].ri_->ltr);
1646 bool ltr = (ltr_line_count >= (end - start) / 2);
1649 if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
1654 int lmargin = (*rows)[start].lmargin_;
1655 int rmargin = (*rows)[start].rmargin_;
1656 int lmin, lmax, rmin, rmax, cmin, cmax;
1657 lmin = lmax = (*rows)[start + 1].lindent_;
1658 rmin = rmax = (*rows)[start + 1].rindent_;
1660 for (
int i = start + 1; i < end; i++) {
1661 if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1662 tprintf(
"Margins don't match! Software error.\n");
1663 *consistent =
false;
1668 UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1670 int ldiff = lmax - lmin;
1671 int rdiff = rmax - rmin;
1672 int cdiff = cmax - cmin;
1673 if (rdiff > tolerance && ldiff > tolerance) {
1674 if (cdiff < tolerance * 2) {
1675 if (end - start < 3)
1679 *consistent =
false;
1682 if (end - start < 3)
1687 bool body_admits_left_alignment = ldiff < tolerance;
1688 bool body_admits_right_alignment = rdiff < tolerance;
1692 (lmin + lmax) / 2, tolerance);
1695 (rmin + rmax) / 2, tolerance);
1699 bool text_admits_left_alignment = ltr || left_model.
is_flush();
1700 bool text_admits_right_alignment = !ltr || right_model.
is_flush();
1705 if (tolerance < rdiff) {
1706 if (body_admits_left_alignment && text_admits_left_alignment)
1708 *consistent =
false;
1711 if (tolerance < ldiff) {
1712 if (body_admits_right_alignment && text_admits_right_alignment)
1714 *consistent =
false;
1722 int first_left = (*rows)[start].lindent_;
1723 int first_right = (*rows)[start].rindent_;
1725 if (ltr && body_admits_left_alignment &&
1726 (first_left < lmin || first_left > lmax))
1728 if (!ltr && body_admits_right_alignment &&
1729 (first_right < rmin || first_right > rmax))
1732 *consistent =
false;
1743 int start,
int end,
int tolerance) {
1744 bool unused_consistent;
1746 rows, start, end, tolerance, &unused_consistent);
1748 tprintf(
"Could not determine a model for this paragraph:\n");
1749 PrintRowRange(*rows, start, end);
1757 if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
1760 for (
int i = start + 1 ; i < end; i++) {
1778 int row_start,
int row_end) {
1780 for (
int i = row_start + 1; i < row_end; i++) {
1818 for (
int i = row_start + 1; i < row_end - 1; i++) {
1849 int row_start,
int row_end,
1850 bool allow_flush_models,
1852 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
1855 int start = row_start;
1856 while (start < row_end) {
1857 while (start < row_end && (*rows)[start].GetLineType() !=
LT_START)
1859 if (start >= row_end - 1)
1862 int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1865 bool next_consistent;
1871 if (end < row_end - 1) {
1874 next_consistent = lt ==
LT_BODY ||
1878 next_consistent =
false;
1880 if (next_consistent) {
1882 rows, start, end + 1, tolerance, &next_consistent);
1883 if (((*rows)[start].ri_->ltr &&
1886 (!(*rows)[start].ri_->ltr &&
1889 next_consistent =
false;
1891 last_model = next_model;
1893 next_consistent =
false;
1895 }
while (next_consistent && end < row_end);
1899 if (end > start + 1) {
1903 debug_level, rows, start, end,
1908 if (end == start + 2) {
1911 }
else if (start == row_start) {
1918 }
else if (allow_flush_models) {
1919 model = theory->
AddModel(new_model);
1922 model = theory->
AddModel(new_model);
1925 (*rows)[start].AddStartLine(model);
1926 for (
int i = start + 1; i < end; i++) {
1927 (*rows)[i].AddBodyLine(model);
1944 int row_start,
int row_end,
1946 if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
1949 if (debug_level > 1) {
1950 tprintf(
"#############################################\n");
1951 tprintf(
"# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
1952 tprintf(
"#############################################\n");
1958 DebugDump(debug_level > 2,
"Initial strong signals.", *theory, *rows);
1963 DebugDump(debug_level > 2,
"Unsmeared hypotheses.s.", *theory, *rows);
1973 int row_start,
int row_end,
1975 for (
int i = row_start + 1; i < row_end - 1; i++) {
1976 if ((*rows)[i - 1].ri_->has_leaders &&
1977 (*rows)[i].ri_->has_leaders &&
1978 (*rows)[i + 1].ri_->has_leaders) {
1981 (*rows)[i].AddStartLine(model);
1993 int end = rows.
size();
1995 for (; end > 0; end = start) {
1999 bool single_line_paragraph =
false;
2001 rows[start].NonNullHypotheses(&models);
2002 if (models.
size() > 0) {
2004 if (rows[start].GetLineType(model) !=
LT_BODY)
2005 single_line_paragraph =
true;
2007 if (model && !single_line_paragraph) {
2009 while (--start > 0 && rows[start].GetLineType(model) ==
LT_BODY) {
2012 if (start < 0 || rows[start].GetLineType(model) !=
LT_START) {
2016 if (model ==
NULL) {
2026 for (
int row = end; row < rows.
size(); row++) {
2027 if ((*row_owners)[row] &&
2031 model = (*row_owners)[row]->model;
2039 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2044 0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2047 rows[start].SetUnknown();
2048 rows[start].AddStartLine(model);
2049 for (
int i = start + 1; i < end; i++) {
2050 rows[i].SetUnknown();
2051 rows[i].AddBodyLine(model);
2057 ? rows[start].ri_->rword_indicates_list_item
2058 : rows[start].ri_->lword_indicates_list_item;
2059 for (
int row = start; row < end; row++) {
2060 if ((*row_owners)[row] !=
NULL) {
2061 tprintf(
"Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
2062 "more than once!\n");
2064 (*row_owners)[row] = p;
2088 rows[row].StrongHypotheses(&row_models);
2090 for (
int m = 0; m < row_models.
size(); m++) {
2091 bool all_starts = rows[row].GetLineType();
2093 bool continues =
true;
2094 for (
int i = row - 1; i >= 0 && continues; i--) {
2096 rows[i].NonNullHypotheses(&models);
2097 switch (rows[i].GetLineType(row_models[m])) {
2098 case LT_START: run_length++;
break;
2100 case LT_BODY: run_length++; all_starts =
false;
break;
2102 default: continues =
false;
2106 for (
int i = row + 1; i < rows.
size() && continues; i++) {
2108 rows[i].NonNullHypotheses(&models);
2109 switch (rows[i].GetLineType(row_models[m])) {
2110 case LT_START: run_length++;
break;
2112 case LT_BODY: run_length++; all_starts =
false;
break;
2114 default: continues =
false;
2117 if (run_length > 2 || (!all_starts && run_length > 1))
return false;
2130 int row_start,
int row_end) {
2132 for (
int i = row_start; i < row_end; i++) {
2133 bool needs_fixing =
false;
2137 rows[i].StrongHypotheses(&models);
2138 rows[i].NonNullHypotheses(&models_w_crowns);
2139 if (models.
empty() && models_w_crowns.
size() > 0) {
2141 for (
int end = i + 1; end < rows.
size(); end++) {
2144 rows[end].NonNullHypotheses(&end_models);
2145 rows[end].StrongHypotheses(&strong_end_models);
2146 if (end_models.
size() == 0) {
2147 needs_fixing =
true;
2149 }
else if (strong_end_models.
size() > 0) {
2150 needs_fixing =
false;
2154 }
else if (models.
empty() && rows[i].ri_->num_words > 0) {
2156 needs_fixing =
true;
2159 if (!needs_fixing && !models.
empty()) {
2171 for (
int i = 0; i < to_fix->
size(); i++) {
2172 (*to_fix)[i].end = (*to_fix)[i].end + 1;
2181 PARA_LIST *paragraphs) {
2183 paragraphs->
clear();
2184 PARA_IT out(paragraphs);
2186 for (
int i = 0; i < rows.
size(); i++) {
2187 if (rows[i] ==
NULL) {
2188 if (i == 0 || rows[i - 1] != formerly_null) {
2189 rows[i] = formerly_null =
new PARA();
2191 rows[i] = formerly_null;
2194 }
else if (i > 0 && rows[i - 1] == rows[i]) {
2197 out.add_after_then_move(rows[i]);
2214 PARA_LIST *paragraphs,
2224 for (
int i = 0; i < row_infos->
size(); i++) {
2225 rows[i].Init((*row_infos)[i]);
2235 DebugDump(debug_level > 1,
"End of Pass 1", theory, rows);
2239 for (
int i = 0; i < leftovers.
size(); i++) {
2246 leftovers[i].begin, leftovers[i].end, &theory);
2254 bool pass2a_was_useful = leftovers2.
size() > 1 ||
2255 (leftovers2.
size() == 1 &&
2256 (leftovers2[0].begin != 0 || leftovers2[0].end != rows.
size()));
2257 if (pass2a_was_useful) {
2258 for (
int j = 0; j < leftovers2.
size(); j++) {
2260 leftovers2[j].begin, leftovers2[j].end,
2266 DebugDump(debug_level > 1,
"End of Pass 2", theory, rows);
2273 for (
int i = 0; i < leftovers.
size(); i++) {
2275 leftovers[i].begin, leftovers[i].end, &theory);
2280 DebugDump(debug_level > 1,
"End of Pass 3", theory, rows);
2285 for (
int i = 0; i < leftovers.
size(); i++) {
2286 for (
int j = leftovers[i].begin; j < leftovers[i].end; j++) {
2287 rows[j].SetUnknown();
2291 DebugDump(debug_level > 1,
"End of Pass 4", theory, rows);
2297 DebugDump(debug_level > 0,
"Final Paragraph Segmentation", theory, rows);
2309 PageIterator pit(static_cast<const PageIterator&>(it));
2310 bool first_word =
true;
2325 if (fake_text.
size() == 0)
return;
2328 for (
int i = 0; i < lspaces; i++) {
2331 info->
text += fake_text;
2343 if (!lword) lword = word_res;
2344 if (rword != word_res) info->
num_words++;
2347 word_res = page_res_it.
forward();
2348 }
while (page_res_it.
row() == this_row);
2388 if (!after_recognition) {
2394 int trailing_ws_idx = strlen(text);
2395 while (trailing_ws_idx > 0 &&
2397 ((text[trailing_ws_idx - 1] & 0x80) == 0) &&
2398 isspace(text[trailing_ws_idx - 1]))
2400 if (trailing_ws_idx > 0) {
2402 for (
int i = 0; i < lspaces; i++)
2404 for (
int i = 0; i < trailing_ws_idx; i++)
2405 info->
text += text[i];
2417 int num_leaders = 0;
2427 word_res = page_res_it.
forward();
2428 }
while (page_res_it.
row() == this_row);
2429 info->
ltr = ltr >= rtl;
2432 if (werds.
size() > 0) {
2433 WERD_RES *lword = werds[0], *rword = werds[werds.
size() - 1];
2437 info->
rword_box = rword->word->bounding_box();
2455 bool after_text_recognition,
2485 if (row_infos.
size() > 0) {
2486 int min_lmargin = row_infos[0].pix_ldistance;
2487 int min_rmargin = row_infos[0].pix_rdistance;
2488 for (
int i = 1; i < row_infos.
size(); i++) {
2489 if (row_infos[i].pix_ldistance < min_lmargin)
2490 min_lmargin = row_infos[i].pix_ldistance;
2491 if (row_infos[i].pix_rdistance < min_rmargin)
2492 min_rmargin = row_infos[i].pix_rdistance;
2494 if (min_lmargin > 0 || min_rmargin > 0) {
2495 for (
int i = 0; i < row_infos.
size(); i++) {
2496 row_infos[i].pix_ldistance -= min_lmargin;
2497 row_infos[i].pix_rdistance -= min_rmargin;
2505 if (!is_image_block) {
2515 for (
int i = 0; i < row_owners.
size(); i++) {