tesseract  4.1.1
pieces.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: pieces.cpp (Formerly pieces.c)
5  * Description:
6  * Author: Mark Seaman, OCR Technology
7  *
8  * (c) Copyright 1987, Hewlett-Packard Company.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  *********************************************************************************/
20 /*----------------------------------------------------------------------
21  I n c l u d e s
22 ----------------------------------------------------------------------*/
23 
24 #include "blobs.h"
25 #include "helpers.h"
26 #include "matrix.h"
27 #include "ratngs.h"
28 #include "seam.h"
29 #include "wordrec.h"
30 
31 // Include automatically generated configuration file if running autoconf.
32 #ifdef HAVE_CONFIG_H
33 #include "config_auto.h"
34 #endif
35 
37 
38 /*----------------------------------------------------------------------
39  F u n c t i o n s
40 ----------------------------------------------------------------------*/
41 
42 /**********************************************************************
43  * classify_piece
44  *
45  * Create a larger piece from a collection of smaller ones. Classify
46  * it and return the results. Take the large piece apart to leave
47  * the collection of small pieces unmodified.
48  **********************************************************************/
49 namespace tesseract {
50 BLOB_CHOICE_LIST *Wordrec::classify_piece(const GenericVector<SEAM*>& seams,
51  int16_t start,
52  int16_t end,
53  const char* description,
54  TWERD *word,
55  BlamerBundle *blamer_bundle) {
56  if (end > start) SEAM::JoinPieces(seams, word->blobs, start, end);
57  BLOB_CHOICE_LIST *choices = classify_blob(word->blobs[start], description,
58  White, blamer_bundle);
59  // Set the matrix_cell_ entries in all the BLOB_CHOICES.
60  BLOB_CHOICE_IT bc_it(choices);
61  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
62  bc_it.data()->set_matrix_cell(start, end);
63  }
64 
65  if (end > start) SEAM::BreakPieces(seams, word->blobs, start, end);
66 
67  return (choices);
68 }
69 
// qsort-style comparator ordering pointers-to-choice by ascending
// unichar_id(). void1 and void2 each point at a `const ChoiceT *`.
// Returns the difference of the two ids (negative, zero or positive).
template<class ChoiceT>
int SortByUnicharID(const void *void1, const void *void2) {
  const ChoiceT *const *lhs_slot = static_cast<const ChoiceT *const *>(void1);
  const ChoiceT *const *rhs_slot = static_cast<const ChoiceT *const *>(void2);

  const auto lhs_id = (*lhs_slot)->unichar_id();
  const auto rhs_id = (*rhs_slot)->unichar_id();
  return lhs_id - rhs_id;
}
77 
// qsort-style comparator ordering pointers-to-choice by descending
// rating(). void1 and void2 each point at a `const ChoiceT *`.
// Returns 1 if the first rating is smaller, -1 if it is larger and 0 if
// the ratings compare equal, so that the ordering is consistent
// (cmp(a, b) == -cmp(b, a)) as sort routines require.
template<class ChoiceT>
int SortByRating(const void *void1, const void *void2) {
  const ChoiceT *p1 = *static_cast<const ChoiceT *const *>(void1);
  const ChoiceT *p2 = *static_cast<const ChoiceT *const *>(void2);

  if (p1->rating() < p2->rating())
    return 1;
  if (p2->rating() < p1->rating())
    return -1;
  // Ties must compare equal; reporting "less" in both directions would
  // hand the sort an inconsistent ordering.
  return 0;
}
87 
88 
89 /**********************************************************************
90  * fill_filtered_fragment_list
91  *
92  * Filter the fragment list so that the filtered_choices only contain
93  * fragments that are in the correct position. choices is the list
94  * that we are going to filter. fragment_pos is the position in the
95  * fragment that we are looking for and num_frag_parts is the the
96  * total number of pieces. The result will be appended to
97  * filtered_choices.
98  **********************************************************************/
99 void Wordrec::fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices,
100  int fragment_pos,
101  int num_frag_parts,
102  BLOB_CHOICE_LIST *filtered_choices) {
103  BLOB_CHOICE_IT filtered_choices_it(filtered_choices);
104  BLOB_CHOICE_IT choices_it(choices);
105 
106  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
107  choices_it.forward()) {
108  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
109  const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id);
110 
111  if (frag != nullptr && frag->get_pos() == fragment_pos &&
112  frag->get_total() == num_frag_parts) {
113  // Recover the unichar_id of the unichar that this fragment is
114  // a part of
115  auto *b = new BLOB_CHOICE(*choices_it.data());
116  int original_unichar = unicharset.unichar_to_id(frag->get_unichar());
117  b->set_unichar_id(original_unichar);
118  filtered_choices_it.add_to_end(b);
119  }
120  }
121 
122  filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>);
123 }
124 
125 
126 /**********************************************************************
127  * merge_and_put_fragment_lists
128  *
129  * Merge the fragment lists in choice_lists and append it to the
130  * ratings matrix.
131  **********************************************************************/
132 void Wordrec::merge_and_put_fragment_lists(int16_t row, int16_t column,
133  int16_t num_frag_parts,
134  BLOB_CHOICE_LIST *choice_lists,
135  MATRIX *ratings) {
136  auto *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];
137 
138  for (int i = 0; i < num_frag_parts; i++) {
139  choice_lists_it[i].set_to_list(&choice_lists[i]);
140  choice_lists_it[i].mark_cycle_pt();
141  }
142 
143  BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
144  if (merged_choice == nullptr)
145  merged_choice = new BLOB_CHOICE_LIST;
146 
147  bool end_of_list = false;
148  BLOB_CHOICE_IT merged_choice_it(merged_choice);
149  while (!end_of_list) {
150  // Find the maximum unichar_id of the current entry the iterators
151  // are pointing at
152  UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
153  for (int i = 0; i < num_frag_parts; i++) {
154  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
155  if (max_unichar_id < unichar_id) {
156  max_unichar_id = unichar_id;
157  }
158  }
159 
160  // Move the each iterators until it gets to an entry that has a
161  // value greater than or equal to max_unichar_id
162  for (int i = 0; i < num_frag_parts; i++) {
163  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
164  while (!choice_lists_it[i].cycled_list() &&
165  unichar_id < max_unichar_id) {
166  choice_lists_it[i].forward();
167  unichar_id = choice_lists_it[i].data()->unichar_id();
168  }
169  if (choice_lists_it[i].cycled_list()) {
170  end_of_list = true;
171  break;
172  }
173  }
174 
175  if (end_of_list)
176  break;
177 
178  // Checks if the fragments are parts of the same character
179  UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
180  bool same_unichar = true;
181  for (int i = 1; i < num_frag_parts; i++) {
182  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
183  if (unichar_id != first_unichar_id) {
184  same_unichar = false;
185  break;
186  }
187  }
188 
189  if (same_unichar) {
190  // Add the merged character to the result
191  UNICHAR_ID merged_unichar_id = first_unichar_id;
192  GenericVector<ScoredFont> merged_fonts =
193  choice_lists_it[0].data()->fonts();
194  float merged_min_xheight = choice_lists_it[0].data()->min_xheight();
195  float merged_max_xheight = choice_lists_it[0].data()->max_xheight();
196  float positive_yshift = 0, negative_yshift = 0;
197  int merged_script_id = choice_lists_it[0].data()->script_id();
198  BlobChoiceClassifier classifier = choice_lists_it[0].data()->classifier();
199 
200  float merged_rating = 0, merged_certainty = 0;
201  for (int i = 0; i < num_frag_parts; i++) {
202  float rating = choice_lists_it[i].data()->rating();
203  float certainty = choice_lists_it[i].data()->certainty();
204 
205  if (i == 0 || certainty < merged_certainty)
206  merged_certainty = certainty;
207  merged_rating += rating;
208 
209  choice_lists_it[i].forward();
210  if (choice_lists_it[i].cycled_list())
211  end_of_list = true;
212  IntersectRange(choice_lists_it[i].data()->min_xheight(),
213  choice_lists_it[i].data()->max_xheight(),
214  &merged_min_xheight, &merged_max_xheight);
215  float yshift = choice_lists_it[i].data()->yshift();
216  if (yshift > positive_yshift) positive_yshift = yshift;
217  if (yshift < negative_yshift) negative_yshift = yshift;
218  // Use the min font rating over the parts.
219  // TODO(rays) font lists are unsorted. Need to be faster?
220  const GenericVector<ScoredFont>& frag_fonts =
221  choice_lists_it[i].data()->fonts();
222  for (int f = 0; f < frag_fonts.size(); ++f) {
223  int merged_f = 0;
224  for (merged_f = 0; merged_f < merged_fonts.size() &&
225  merged_fonts[merged_f].fontinfo_id != frag_fonts[f].fontinfo_id;
226  ++merged_f) {}
227  if (merged_f == merged_fonts.size()) {
228  merged_fonts.push_back(frag_fonts[f]);
229  } else if (merged_fonts[merged_f].score > frag_fonts[f].score) {
230  merged_fonts[merged_f].score = frag_fonts[f].score;
231  }
232  }
233  }
234 
235  float merged_yshift = positive_yshift != 0
236  ? (negative_yshift != 0 ? 0 : positive_yshift)
237  : negative_yshift;
238  auto* choice = new BLOB_CHOICE(merged_unichar_id,
239  merged_rating,
240  merged_certainty,
241  merged_script_id,
242  merged_min_xheight,
243  merged_max_xheight,
244  merged_yshift,
245  classifier);
246  choice->set_fonts(merged_fonts);
247  merged_choice_it.add_to_end(choice);
248  }
249  }
250 
252  print_ratings_list("Merged Fragments", merged_choice,
253  unicharset);
254 
255  if (merged_choice->empty())
256  delete merged_choice;
257  else
258  ratings->put(row, column, merged_choice);
259 
260  delete [] choice_lists_it;
261 }
262 
263 /**********************************************************************
264  * get_fragment_lists
265  *
266  * Recursively go through the ratings matrix to find lists of fragments
267  * to be merged in the function merge_and_put_fragment_lists.
268  * current_frag is the position of the piece we are looking for.
269  * current_row is the row in the rating matrix we are currently at.
270  * start is the row we started initially, so that we can know where
271  * to append the results to the matrix. num_frag_parts is the total
272  * number of pieces we are looking for and num_blobs is the size of the
273  * ratings matrix.
274  **********************************************************************/
275 void Wordrec::get_fragment_lists(int16_t current_frag, int16_t current_row,
276  int16_t start, int16_t num_frag_parts,
277  int16_t num_blobs, MATRIX *ratings,
278  BLOB_CHOICE_LIST *choice_lists) {
279  if (current_frag == num_frag_parts) {
280  merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts,
281  choice_lists, ratings);
282  return;
283  }
284 
285  for (int16_t x = current_row; x < num_blobs; x++) {
286  BLOB_CHOICE_LIST *choices = ratings->get(current_row, x);
287  if (choices == nullptr)
288  continue;
289 
290  fill_filtered_fragment_list(choices, current_frag, num_frag_parts,
291  &choice_lists[current_frag]);
292  if (!choice_lists[current_frag].empty()) {
293  get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts,
294  num_blobs, ratings, choice_lists);
295  choice_lists[current_frag].clear();
296  }
297  }
298 }
299 
300 
301 /**********************************************************************
302  * merge_fragments
303  *
304  * Try to merge fragments in the ratings matrix and put the result in
305  * the corresponding row and column
306  **********************************************************************/
307 void Wordrec::merge_fragments(MATRIX *ratings, int16_t num_blobs) {
308  BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks];
309  for (int16_t start = 0; start < num_blobs; start++) {
310  for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks;
311  frag_parts++) {
312  get_fragment_lists(0, start, start, frag_parts, num_blobs,
313  ratings, choice_lists);
314  }
315  }
316 
317  // Delete fragments from the rating matrix
318  for (int16_t x = 0; x < num_blobs; x++) {
319  for (int16_t y = x; y < num_blobs; y++) {
320  BLOB_CHOICE_LIST *choices = ratings->get(x, y);
321  if (choices != nullptr) {
322  BLOB_CHOICE_IT choices_it(choices);
323  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
324  choices_it.forward()) {
325  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
326  const CHAR_FRAGMENT *frag =
327  unicharset.get_fragment(choice_unichar_id);
328  if (frag != nullptr)
329  delete choices_it.extract();
330  }
331  }
332  }
333  }
334 }
335 
336 
337 } // namespace tesseract
tesseract::Wordrec::merge_fragments
void merge_fragments(MATRIX *ratings, int16_t num_blobs)
Definition: pieces.cpp:313
tesseract::Wordrec::fill_filtered_fragment_list
void fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
Definition: pieces.cpp:105
BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
BLOB_CHOICE
Definition: ratngs.h:52
TWERD
Definition: blobs.h:418
CHAR_FRAGMENT::get_total
int get_total() const
Definition: unicharset.h:72
tesseract::Wordrec::classify_piece
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:56
tesseract::Classify::classify_debug_level
int classify_debug_level
Definition: classify.h:430
CHAR_FRAGMENT::get_unichar
const char * get_unichar() const
Definition: unicharset.h:70
wordrec.h
ratngs.h
tesseract::SortByRating
int SortByRating(const void *void1, const void *void2)
Definition: pieces.cpp:85
tesseract
Definition: altorenderer.cpp:25
BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:80
SEAM::BreakPieces
static void BreakPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:188
TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
GenericVector< SEAM * >
tesseract::Wordrec::get_fragment_lists
void get_fragment_lists(int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
Definition: pieces.cpp:281
IntersectRange
void IntersectRange(const T &lower1, const T &upper1, T *lower2, T *upper2)
Definition: helpers.h:145
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:34
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
matrix.h
SEAM::JoinPieces
static void JoinPieces(const GenericVector< SEAM * > &seams, const GenericVector< TBLOB * > &blobs, int first, int last)
Definition: seam.cpp:210
BlamerBundle
Definition: blamer.h:102
tesseract::Wordrec::classify_blob
BLOB_CHOICE_LIST * classify_blob(TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
Definition: wordclass.cpp:54
helpers.h
tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:73
GENERIC_2D_ARRAY::put
void put(ICOORD pos, const T &thing)
Definition: matrix.h:223
tesseract::ScoredFont
Definition: fontinfo.h:38
White
@ White
Definition: callcpp.h:29
CHAR_FRAGMENT::kMaxChunks
static const int kMaxChunks
Definition: unicharset.h:55
blobs.h
tesseract::SortByUnicharID
int SortByUnicharID(const void *void1, const void *void2)
Definition: pieces.cpp:77
BlobChoiceClassifier
BlobChoiceClassifier
Definition: ratngs.h:43
UNICHARSET::get_fragment
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
MATRIX
Definition: matrix.h:578
GenericVector::size
int size() const
Definition: genericvector.h:72
tesseract::Wordrec::merge_and_put_fragment_lists
void merge_and_put_fragment_lists(int16_t row, int16_t column, int16_t num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
Definition: pieces.cpp:138
print_ratings_list
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:837
GENERIC_2D_ARRAY::get
T get(ICOORD pos) const
Definition: matrix.h:231
CHAR_FRAGMENT
Definition: unicharset.h:48
CHAR_FRAGMENT::get_pos
int get_pos() const
Definition: unicharset.h:71
seam.h
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:837