tesseract  4.1.1
reject.cpp File Reference
#include "tessvars.h"
#include <cctype>
#include <cerrno>
#include <cstring>
#include "genericvector.h"
#include "reject.h"
#include "control.h"
#include "docqual.h"
#include "helpers.h"
#include "tesseractclass.h"

Go to the source code of this file.

Namespaces

 tesseract
 

Functions

 CLISTIZEH (STRING) CLISTIZE(STRING) namespace tesseract
 
void reject_blanks (WERD_RES *word)
 
void reject_poor_matches (WERD_RES *word)
 
float compute_reject_threshold (WERD_CHOICE *word)
 

Function Documentation

◆ CLISTIZEH()

CLISTIZEH ( STRING  )

Definition at line 51 of file reject.cpp.

59  {
60 void Tesseract::set_done(WERD_RES *word, int16_t pass) {
61  word->done = word->tess_accepted &&
62  (strchr(word->best_choice->unichar_string().string(), ' ') == nullptr);
63  bool word_is_ambig = word->best_choice->dangerous_ambig_found();
64  bool word_from_dict = word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
65  word->best_choice->permuter() == FREQ_DAWG_PERM ||
67  if (word->done && (pass == 1) && (!word_from_dict || word_is_ambig) &&
68  one_ell_conflict(word, false)) {
69  if (tessedit_rejection_debug) tprintf("one_ell_conflict detected\n");
70  word->done = false;
71  }
72  if (word->done && ((!word_from_dict &&
73  word->best_choice->permuter() != NUMBER_PERM) || word_is_ambig)) {
74  if (tessedit_rejection_debug) tprintf("non-dict or ambig word detected\n");
75  word->done = false;
76  }
77  if (tessedit_rejection_debug) {
78  tprintf("set_done(): done=%d\n", word->done);
79  word->best_choice->print("");
80  }
81 }
82 
83 
84 /*************************************************************************
85  * make_reject_map()
86  *
87  * Sets the done flag to indicate whether the resylt is acceptable.
88  *
89  * Sets a reject map for the word.
90  *************************************************************************/
91 void Tesseract::make_reject_map(WERD_RES *word, ROW *row, int16_t pass) {
92  int i;
93  int offset;
94 
95  flip_0O(word);
96  check_debug_pt(word, -1); // For trap only
97  set_done(word, pass); // Set acceptance
99  reject_blanks(word);
100  /*
101  0: Rays original heuristic - the baseline
102  */
103  if (tessedit_reject_mode == 0) {
104  if (!word->done)
105  reject_poor_matches(word);
106  } else if (tessedit_reject_mode == 5) {
107  /*
108  5: Reject I/1/l from words where there is no strong contextual confirmation;
109  the whole of any unacceptable words (incl PERM rej of dubious 1/I/ls);
110  and the whole of any words which are very small
111  */
112  if (kBlnXHeight / word->denorm.y_scale() <= min_sane_x_ht_pixels) {
114  } else {
115  one_ell_conflict(word, true);
116  /*
117  Originally the code here just used the done flag. Now I have duplicated
118  and unpacked the conditions for setting the done flag so that each
119  mechanism can be turned on or off independently. This works WITHOUT
120  affecting the done flag setting.
121  */
122  if (rej_use_tess_accepted && !word->tess_accepted)
124 
125  if (rej_use_tess_blanks &&
126  (strchr (word->best_choice->unichar_string().string (), ' ') != nullptr))
128 
129  WERD_CHOICE* best_choice = word->best_choice;
130  if (rej_use_good_perm) {
131  if ((best_choice->permuter() == SYSTEM_DAWG_PERM ||
132  best_choice->permuter() == FREQ_DAWG_PERM ||
133  best_choice->permuter() == USER_DAWG_PERM) &&
134  (!rej_use_sensible_wd ||
135  acceptable_word_string(*word->uch_set,
136  best_choice->unichar_string().string(),
137  best_choice->unichar_lengths().string()) !=
138  AC_UNACCEPTABLE)) {
139  // PASSED TEST
140  } else if (best_choice->permuter() == NUMBER_PERM) {
141  if (rej_alphas_in_number_perm) {
142  for (i = 0, offset = 0;
143  best_choice->unichar_string()[offset] != '\0';
144  offset += best_choice->unichar_lengths()[i++]) {
145  if (word->reject_map[i].accepted() &&
146  word->uch_set->get_isalpha(
147  best_choice->unichar_string().string() + offset,
148  best_choice->unichar_lengths()[i]))
149  word->reject_map[i].setrej_bad_permuter();
150  // rej alpha
151  }
152  }
153  } else {
155  }
156  }
157  /* Ambig word rejection was here once !!*/
158  }
159  } else {
160  tprintf("BAD tessedit_reject_mode\n");
161  ASSERT_HOST("Fatal error encountered!" == nullptr);
162  }
163 
164  if (tessedit_image_border > -1)
165  reject_edge_blobs(word);
166 
167  check_debug_pt (word, 10);
168  if (tessedit_rejection_debug) {
169  tprintf("Permuter Type = %d\n", word->best_choice->permuter ());
170  tprintf("Certainty: %f Rating: %f\n",
171  word->best_choice->certainty (), word->best_choice->rating ());
172  tprintf("Dict word: %d\n", dict_word(*(word->best_choice)));
173  }
174 
175  flip_hyphens(word);
176  check_debug_pt(word, 20);
177 }
178 } // namespace tesseract

◆ compute_reject_threshold()

float compute_reject_threshold ( WERD_CHOICE * word)

Definition at line 229 of file reject.cpp.

229  {
230  float threshold; // rejection threshold
231  float bestgap = 0.0f; // biggest gap
232  float gapstart; // bottom of gap
233 
234  int blob_count = word->length();
235  GenericVector<float> ratings;
236  ratings.resize_no_init(blob_count);
237  for (int i = 0; i < blob_count; ++i) {
238  ratings[i] = word->certainty(i);
239  }
240  ratings.sort();
241  gapstart = ratings[0] - 1; // all reject if none better
242  if (blob_count >= 3) {
243  for (int index = 0; index < blob_count - 1; index++) {
244  if (ratings[index + 1] - ratings[index] > bestgap) {
245  bestgap = ratings[index + 1] - ratings[index];
246  // find biggest
247  gapstart = ratings[index];
248  }
249  }
250  }
251  threshold = gapstart + bestgap / 2;
252 
253  return threshold;
254 }

◆ reject_blanks()

void reject_blanks ( WERD_RES * word)

Definition at line 181 of file reject.cpp.

181  {
182  int16_t i;
183  int16_t offset;
184 
185  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
186  offset += word->best_choice->unichar_lengths()[i], i += 1) {
187  if (word->best_choice->unichar_string()[offset] == ' ')
188  //rej unrecognised blobs
189  word->reject_map[i].setrej_tess_failure ();
190  }
191 }

◆ reject_poor_matches()

void reject_poor_matches ( WERD_RES * word)

Definition at line 210 of file reject.cpp.

210  {
211  float threshold = compute_reject_threshold(word->best_choice);
212  for (int i = 0; i < word->best_choice->length(); ++i) {
213  if (word->best_choice->unichar_id(i) == UNICHAR_SPACE)
214  word->reject_map[i].setrej_tess_failure();
215  else if (word->best_choice->certainty(i) < threshold)
216  word->reject_map[i].setrej_poor_match();
217  }
218 }
WERD_CHOICE::dangerous_ambig_found
bool dangerous_ambig_found() const
Definition: ratngs.h:353
STRING::string
const char * string() const
Definition: strngs.cpp:194
WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:241
GenericVector::sort
void sort()
Definition: genericvector.h:1146
reject_poor_matches
void reject_poor_matches(WERD_RES *word)
Definition: reject.cpp:210
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:531
flip_hyphens
void flip_hyphens(WERD_RES *word)
WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:203
SYSTEM_DAWG_PERM
@ SYSTEM_DAWG_PERM
Definition: ratngs.h:241
REJMAP::rej_word_contains_blanks
void rej_word_contains_blanks()
Definition: rejctmap.cpp:370
UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
STRING::length
int32_t length() const
Definition: strngs.cpp:189
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:317
WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:294
AC_UNACCEPTABLE
@ AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:30
REJMAP::rej_word_bad_permuter
void rej_word_bad_permuter()
Definition: rejctmap.cpp:379
GenericVector< float >
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:88
WERD_CHOICE::unichar_lengths
const STRING & unichar_lengths() const
Definition: ratngs.h:538
DENORM::y_scale
float y_scale() const
Definition: normalis.h:270
WERD_RES::tess_accepted
bool tess_accepted
Definition: pageres.h:303
WERD_RES::done
bool done
Definition: pageres.h:305
compute_reject_threshold
float compute_reject_threshold(WERD_CHOICE *word)
Definition: reject.cpp:229
kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:24
WERD_RES
Definition: pageres.h:166
NUMBER_PERM
@ NUMBER_PERM
Definition: ratngs.h:239
REJMAP::rej_word_not_tess_accepted
void rej_word_not_tess_accepted()
Definition: rejctmap.cpp:361
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:320
REJMAP::initialise
void initialise(int16_t length)
Definition: rejctmap.cpp:273
USER_DAWG_PERM
@ USER_DAWG_PERM
Definition: ratngs.h:243
GenericVector::resize_no_init
void resize_no_init(int size)
Definition: genericvector.h:66
FREQ_DAWG_PERM
@ FREQ_DAWG_PERM
Definition: ratngs.h:244
WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:336
WERD_RES::denorm
DENORM denorm
Definition: pageres.h:201
WERD_CHOICE::length
int length() const
Definition: ratngs.h:293
ROW
Definition: ocrrow.h:37
WERD_CHOICE::print
void print() const
Definition: ratngs.h:570
reject_blanks
void reject_blanks(WERD_RES *word)
Definition: reject.cpp:181
flip_0O
void flip_0O(WERD_RES *word)
WERD_CHOICE
Definition: ratngs.h:263
UNICHAR_SPACE
@ UNICHAR_SPACE
Definition: unicharset.h:34
REJMAP::rej_word_small_xht
void rej_word_small_xht()
Definition: rejctmap.cpp:343