tesseract  4.1.1
reject.cpp File Reference
#include "tessvars.h"
#include <cctype>
#include <cerrno>
#include <cstring>
#include "genericvector.h"
#include "reject.h"
#include "control.h"
#include "docqual.h"
#include "helpers.h"
#include "tesseractclass.h"

Go to the source code of this file.

Namespaces

 tesseract
 

Functions

 CLISTIZEH (STRING) CLISTIZE(STRING) namespace tesseract
 
void reject_blanks (WERD_RES *word)
 
void reject_poor_matches (WERD_RES *word)
 
float compute_reject_threshold (WERD_CHOICE *word)
 

Function Documentation

◆ CLISTIZEH()

CLISTIZEH ( STRING  )

Definition at line 51 of file reject.cpp.

59  {
60 void Tesseract::set_done(WERD_RES *word, int16_t pass) {
61  word->done = word->tess_accepted &&
62  (strchr(word->best_choice->unichar_string().string(), ' ') == nullptr);
63  bool word_is_ambig = word->best_choice->dangerous_ambig_found();
64  bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
65  word->best_choice->permuter() == FREQ_DAWG_PERM ||
67  if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
68  one_ell_conflict(word, false)) {
69  if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
70  word->done = false;
71  }
72  if (word->done && ((!word_from_dict &&
73  word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
74  if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
75  word->done = false;
76  }
77  if (tessedit_rejection_debug) {
78  tprintf("set_done(): done=%d\n", word->done);
79  word->best_choice->print("");
80  }
81 }
82 
83 
84 /*************************************************************************
85  * make_reject_map()
86  *
87  * Sets the done flag to indicate whether the resylt is acceptable.
88  *
89  * Sets a reject map for the word.
90  *************************************************************************/
91 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
92  int i;
93  int offset;
94 
95  flip_0O(word);
96  check_debug_pt(word, -1); // For trap only
97  set_done(word, pass); // Set acceptance
99  reject_blanks(word);
100  /*
101  0: Rays original heuristic - the baseline
102  */
103  if (tessedit_reject_mode == 0) {
104  if (!word->done)
105  reject_poor_matches(word);
106  } else if (tessedit_reject_mode == 5) {
107  /*
108  5: Reject I/1/l from words where there is no strong contextual confirmation;
109  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
110  and the whole of any words which are very small
111  */
112  if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
114  } else {
115  one_ell_conflict(word, true);
116  /*
117  Originally the code here just used the done flag. Now I have duplicated
118  and unpacked the conditions for setting the done flag so that each
119  mechanism can be turned on or off independently. This works WITHOUT
120  affecting the done flag setting.
121  */
122  if (rej_use_tess_accepted && !word->tess_accepted)
124 
125  if (rej_use_tess_blanks &&
126  (strchr (word->best_choice->unichar_string().string (), ' ') != nullptr))
128 
129  WERD_CHOICE* best_choice = word->best_choice;
130  if (rej_use_good_perm) {
131  if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
132  best_choice->permuter() == FREQ_DAWG_PERM ||
133  best_choice->permuter() == USER_DAWG_PERM) &&
134  (!rej_use_sensible_wd ||
135  acceptable_word_string(*word->uch_set,
136  best_choice->unichar_string().string(),
137  best_choice->unichar_lengths().string()) !=
138  AC_UNACCEPTABLE)) {
139  // PASSED TEST
140  } else if (best_choice->permuter() == NUMBER_PERM) {
141  if (rej_alphas_in_number_perm) {
142  for (i = 0, offset = 0;
143  best_choice->unichar_string()[offset] != '\0';
144  offset += best_choice->unichar_lengths()[i++]) {
145  if (word->reject_map[i].accepted() &&
146  word->uch_set->get_isalpha(
147  best_choice->unichar_string().string() + offset,
148  best_choice->unichar_lengths()[i]))
149  word->reject_map[i].setrej_bad_permuter();
150  // rej alpha
151  }
152  }
153  } else {
155  }
156  }
157  /* Ambig word rejection was here once !!*/
158  }
159  } else {
160  tprintf("BAD tessedit_reject_mode\n");
161  ASSERT_HOST("Fatal error encountered!" == nullptr);
162  }
163 
164  if (tessedit_image_border > -1)
165  reject_edge_blobs(word);
166 
167  check_debug_pt (word, 10);
168  if (tessedit_rejection_debug) {
169  tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
170  tprintf("Certainty: %f Rating: %f\n",
171  word->best_choice->certainty (), word->best_choice->rating ());
172  tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
173  }
174 
175  flip_hyphens(word);
176  check_debug_pt(word, 20);
177 }
178 } // namespace tesseract
@ AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:30
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:181
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:210
void flip_hyphens(WERD_RES *word)
void flip_0O(WERD_RES *word)
const int kBlnXHeight
Definition: normalis.h:24
@ FREQ_DAWG_PERM
Definition: ratngs.h:244
@ USER_DAWG_PERM
Definition: ratngs.h:243
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:241
@ NUMBER_PERM
Definition: ratngs.h:239
#define ASSERT_HOST(x)
Definition: errcode.h:88
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
float y_scale() const
Definition: normalis.h:270
Definition: ocrrow.h:37
const UNICHARSET * uch_set
Definition: pageres.h:203
DENORM denorm
Definition: pageres.h:201
bool done
Definition: pageres.h:305
WERD_CHOICE * best_choice
Definition: pageres.h:241
bool tess_accepted
Definition: pageres.h:303
REJMAP reject_map
Definition: pageres.h:294
bool dangerous_ambig_found() const
Definition: ratngs.h:353
uint8_t permuter() const
Definition: ratngs.h:336
float certainty() const
Definition: ratngs.h:320
const STRING & unichar_lengths() const
Definition: ratngs.h:538
const STRING & unichar_string() const
Definition: ratngs.h:531
void print() const
Definition: ratngs.h:570
float rating() const
Definition: ratngs.h:317
void rej_word_bad_permuter()
Definition: rejctmap.cpp:379
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:361
void initialise(int16_t length)
Definition: rejctmap.cpp:273
void rej_word_small_xht()
Definition: rejctmap.cpp:343
void rej_word_contains_blanks()
Definition: rejctmap.cpp:370
int32_t length() const
Definition: strngs.cpp:189
const char * string() const
Definition: strngs.cpp:194
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491

◆ compute_reject_threshold()

float compute_reject_threshold ( WERD_CHOICE * word)

Definition at line 229 of file reject.cpp.

229  {
230  float threshold; // rejection threshold
231  float bestgap = 0.0f; // biggest gap
232  float gapstart; // bottom of gap
233 
234  int blob_count = word->length();
235  GenericVector<float> ratings;
236  ratings.resize_no_init(blob_count);
237  for (int i = 0; i < blob_count; ++i) {
238  ratings[i] = word->certainty(i);
239  }
240  ratings.sort();
241  gapstart = ratings[0] - 1; // all reject if none better
242  if (blob_count >= 3) {
243  for (int index = 0; index < blob_count - 1; index++) {
244  if (ratings[index + 1] - ratings[index] > bestgap) {
245  bestgap = ratings[index + 1] - ratings[index];
246  // find biggest
247  gapstart = ratings[index];
248  }
249  }
250  }
251  threshold = gapstart + bestgap / 2;
252 
253  return threshold;
254 }
void resize_no_init(int size)
Definition: genericvector.h:66
int length() const
Definition: ratngs.h:293

◆ reject_blanks()

void reject_blanks ( WERD_RES * word)

Definition at line 181 of file reject.cpp.

181  {
182  int16_t i;
183  int16_t offset;
184 
185  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
186  offset += word->best_choice->unichar_lengths()[i], i += 1) {
187  if (word->best_choice->unichar_string()[offset] == ' ')
188  //rej unrecognised blobs
189  word->reject_map[i].setrej_tess_failure ();
190  }
191 }

◆ reject_poor_matches()

void reject_poor_matches ( WERD_RES * word)

Definition at line 210 of file reject.cpp.

210  {
211  float threshold = compute_reject_threshold(word->best_choice);
212  for (int i = 0; i < word->best_choice->length(); ++i) {
213  if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
214  word->reject_map[i].setrej_tess_failure();
215  else if (word->best_choice->certainty(i) < threshold)
216  word->reject_map[i].setrej_poor_match();
217  }
218 }
float compute_reject_threshold(WERD_CHOICE *word)
Definition: reject.cpp:229
@ UNICHAR_SPACE
Definition: unicharset.h:34
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305