#include <tesseractclass.h>
|
| Tesseract () |
|
| ~Tesseract () override |
|
Dict & | getDict () override |
|
void | Clear () |
|
void | ResetAdaptiveClassifier () |
|
void | ResetDocumentDictionary () |
|
void | SetEquationDetect (EquationDetect *detector) |
|
const FCOORD & | reskew () const |
|
Pix ** | mutable_pix_binary () |
|
Pix * | pix_binary () const |
|
Pix * | pix_grey () const |
|
void | set_pix_grey (Pix *grey_pix) |
|
Pix * | pix_original () const |
|
void | set_pix_original (Pix *original_pix) |
|
Pix * | BestPix () const |
|
void | set_pix_thresholds (Pix *thresholds) |
|
int | source_resolution () const |
|
void | set_source_resolution (int ppi) |
|
int | ImageWidth () const |
|
int | ImageHeight () const |
|
Pix * | scaled_color () const |
|
int | scaled_factor () const |
|
void | SetScaledColor (int factor, Pix *color) |
|
const Textord & | textord () const |
|
Textord * | mutable_textord () |
|
bool | right_to_left () const |
|
int | num_sub_langs () const |
|
Tesseract * | get_sub_lang (int index) const |
|
bool | AnyTessLang () const |
|
bool | AnyLSTMLang () const |
|
void | SetBlackAndWhitelist () |
|
void | PrepareForPageseg () |
|
void | PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr) |
|
int | SegmentPage (const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr) |
|
void | SetupWordScripts (BLOCK_LIST *blocks) |
|
int | AutoPageSeg (PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr) |
|
ColumnFinder * | SetupPageSegAndDetectOrientation (PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix) |
|
void | PrerecAllWordsPar (const GenericVector< WordData > &words) |
|
bool | TrainLineRecognizer (const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list) |
|
void | TrainFromBoxes (const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data) |
|
ImageData * | GetLineData (const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block) |
|
ImageData * | GetRectImage (const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const |
|
void | LSTMRecognizeWord (const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words) |
|
void | SearchWords (PointerVector< WERD_RES > *words) |
|
bool | ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass) |
|
void | SetupAllWordsPassN (int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words) |
|
void | SetupWordPassN (int pass_n, WordData *word) |
|
bool | RecogAllWordsPassN (int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words) |
|
bool | recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses) |
|
void | rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config) |
|
void | bigram_correction_pass (PAGE_RES *page_res) |
|
void | blamer_pass (PAGE_RES *page_res) |
|
void | script_pos_pass (PAGE_RES *page_res) |
|
int | RetryWithLanguage (const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words) |
|
bool | ReassignDiacritics (int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy) |
|
void | AssignDiacriticsToOverlappingBlobs (const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs) |
|
void | AssignDiacriticsToNewBlobs (const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs) |
|
bool | SelectGoodDiacriticOutlines (int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines) |
|
float | ClassifyBlobPlusOutlines (const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str) |
|
float | ClassifyBlobAsWord (int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2) |
|
void | classify_word_and_language (int pass_n, PAGE_RES_IT *pr_it, WordData *word_data) |
|
void | classify_word_pass1 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words) |
|
void | recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box) |
|
void | fix_rep_char (PAGE_RES_IT *page_res_it) |
|
ACCEPTABLE_WERD_TYPE | acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths) |
|
void | match_word_pass_n (int pass_n, WERD_RES *word, ROW *row, BLOCK *block) |
|
void | classify_word_pass2 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words) |
|
void | ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word) |
|
bool | RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row) |
|
bool | TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row) |
|
bool | TestNewNormalization (int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row) |
|
bool | recog_interactive (PAGE_RES_IT *pr_it) |
|
void | set_word_fonts (WERD_RES *word) |
|
void | font_recognition_pass (PAGE_RES *page_res) |
|
void | dictionary_correction_pass (PAGE_RES *page_res) |
|
bool | check_debug_pt (WERD_RES *word, int location) |
|
bool | SubAndSuperscriptFix (WERD_RES *word_res) |
|
void | GetSubAndSuperscriptCandidates (const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold) |
|
WERD_RES * | TrySuperscriptSplits (int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing) |
|
bool | BelievableSuperscript (bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const |
|
void | output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box) |
|
void | write_results (PAGE_RES_IT &page_res_it, char newline_type, bool force_eol) |
|
void | set_unlv_suspects (WERD_RES *word) |
|
UNICHAR_ID | get_rep_char (WERD_RES *word) |
|
bool | acceptable_number_string (const char *s, const char *lengths) |
|
int16_t | count_alphanums (const WERD_CHOICE &word) |
|
int16_t | count_alphas (const WERD_CHOICE &word) |
|
void | read_config_file (const char *filename, SetParamConstraint constraint) |
|
int | init_tesseract (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr) |
|
int | init_tesseract (const char *datapath, const char *language, OcrEngineMode oem) |
|
int | init_tesseract_internal (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr) |
|
void | SetupUniversalFontIds () |
|
int | init_tesseract_lm (const char *arg0, const char *textbase, const char *language, TessdataManager *mgr) |
|
void | recognize_page (STRING &image_name) |
|
void | end_tesseract () |
|
bool | init_tesseract_lang_data (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr) |
|
void | ParseLanguageString (const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load) |
|
SVMenuNode * | build_menu_new () |
|
void | pgeditor_main (int width, int height, PAGE_RES *page_res) |
|
void | process_image_event (const SVEvent &event) |
|
bool | process_cmd_win_event (int32_t cmd_event, char *new_value) |
|
void | debug_word (PAGE_RES *page_res, const TBOX &selection_box) |
|
void | do_re_display (bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it)) |
|
bool | word_display (PAGE_RES_IT *pr_it) |
|
bool | word_bln_display (PAGE_RES_IT *pr_it) |
|
bool | word_blank_and_set_display (PAGE_RES_IT *pr_its) |
|
bool | word_set_display (PAGE_RES_IT *pr_it) |
|
bool | word_dumper (PAGE_RES_IT *pr_it) |
|
void | blob_feature_display (PAGE_RES *page_res, const TBOX &selection_box) |
|
void | make_reject_map (WERD_RES *word, ROW *row, int16_t pass) |
|
bool | one_ell_conflict (WERD_RES *word_res, bool update_map) |
|
int16_t | first_alphanum_index (const char *word, const char *word_lengths) |
|
int16_t | first_alphanum_offset (const char *word, const char *word_lengths) |
|
int16_t | alpha_count (const char *word, const char *word_lengths) |
|
bool | word_contains_non_1_digit (const char *word, const char *word_lengths) |
|
void | dont_allow_1Il (WERD_RES *word) |
|
int16_t | count_alphanums (WERD_RES *word) |
|
void | flip_0O (WERD_RES *word) |
|
bool | non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id) |
|
bool | non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id) |
|
bool | repeated_nonalphanum_wd (WERD_RES *word, ROW *row) |
|
void | nn_match_word (WERD_RES *word, ROW *row) |
|
void | nn_recover_rejects (WERD_RES *word, ROW *row) |
|
void | set_done (WERD_RES *word, int16_t pass) |
|
int16_t | safe_dict_word (const WERD_RES *werd_res) |
|
void | flip_hyphens (WERD_RES *word) |
|
void | reject_I_1_L (WERD_RES *word) |
|
void | reject_edge_blobs (WERD_RES *word) |
|
void | reject_mostly_rejects (WERD_RES *word) |
|
bool | word_adaptable (WERD_RES *word, uint16_t mode) |
|
void | recog_word_recursive (WERD_RES *word) |
|
void | recog_word (WERD_RES *word) |
|
void | split_and_recog_word (WERD_RES *word) |
|
void | split_word (WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const |
|
void | join_words (WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const |
|
void | match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block) |
|
int16_t | fp_eval_word_spacing (WERD_RES_LIST &word_res_list) |
|
void | dump_words (WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved) |
|
bool | fixspace_thinks_word_done (WERD_RES *word) |
|
GARBAGE_LEVEL | garbage_word (WERD_RES *word, bool ok_dict_word) |
|
bool | potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word) |
|
void | tilde_crunch (PAGE_RES_IT &page_res_it) |
|
void | unrej_good_quality_words (PAGE_RES_IT &page_res_it) |
|
void | doc_and_block_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc) |
|
void | quality_based_rejection (PAGE_RES_IT &page_res_it, bool good_quality_doc) |
|
void | convert_bad_unlv_chs (WERD_RES *word_res) |
|
void | tilde_delete (PAGE_RES_IT &page_res_it) |
|
int16_t | word_blob_quality (WERD_RES *word, ROW *row) |
|
void | word_char_quality (WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count) |
|
void | unrej_good_chs (WERD_RES *word, ROW *row) |
|
int16_t | count_outline_errs (char c, int16_t outline_count) |
|
int16_t | word_outline_errs (WERD_RES *word) |
|
bool | terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level) |
|
CRUNCH_MODE | word_deletable (WERD_RES *word, int16_t &delete_mode) |
|
int16_t | failure_count (WERD_RES *word) |
|
bool | noise_outlines (TWERD *word) |
|
void | tess_segment_pass_n (int pass_n, WERD_RES *word) |
|
PAGE_RES * | ApplyBoxes (const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list) |
|
void | PreenXHeights (BLOCK_LIST *block_list) |
|
PAGE_RES * | SetupApplyBoxes (const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list) |
|
void | MaximallyChopWord (const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res) |
|
bool | ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text) |
|
bool | ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text) |
|
void | ReSegmentByClassification (PAGE_RES *page_res) |
|
bool | ConvertStringToUnichars (const char *utf8, GenericVector< UNICHAR_ID > *class_ids) |
|
bool | FindSegmentation (const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res) |
|
void | SearchForText (const GenericVector< BLOB_CHOICE_LIST * > *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation) |
|
void | TidyUp (PAGE_RES *page_res) |
|
void | ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg) |
|
void | CorrectClassifyWords (PAGE_RES *page_res) |
|
void | ApplyBoxTraining (const STRING &fontname, PAGE_RES *page_res) |
|
int | CountMisfitTops (WERD_RES *word_res) |
|
float | ComputeCompatibleXheight (WERD_RES *word_res, float *baseline_shift) |
|
FILE * | init_recog_training (const STRING &fname) |
|
void | recog_training_segmented (const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file) |
|
void | ambigs_classify_and_output (const char *label, PAGE_RES_IT *pr_it, FILE *output_file) |
|
|
The basic measure is the number of characters in contextually confirmed words (i.e. words that are done). If all words are contextually confirmed, the evaluation is deemed perfect.
Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred.
The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character on the other side of the space.
Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined.
The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.
|
bool | digit_or_numeric_punct (WERD_RES *word, int char_position) |
|
int16_t | eval_word_spacing (WERD_RES_LIST &word_res_list) |
|
|
Test the current word to see if it can be split by deleting noise blobs. If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words.
|
void | fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) |
|
void | fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) |
|
int16_t | worst_noise_blob (WERD_RES *word_res, float *worst_noise_score) |
|
float | blob_noise_score (TBLOB *blob) |
|
void | break_noisiest_blob_word (WERD_RES_LIST &words) |
|
|
Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.
- Parameters
-
| monitor | progress monitor |
| word_count | count of words in doc |
[out] | page_res | |
|
void | fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) |
|
void | fix_fuzzy_spaces (ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res) |
|
|
Walk the current block list applying the specified word processor function to each word that overlaps the selection_box.
|
void | process_selected_words (PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it)) |
|
|
Add the given word to the document dictionary
|
void | tess_add_doc_word (WERD_CHOICE *word_choice) |
|
|
- Returns
- true if the word is regarded as "good enough".
- Parameters
-
word_choice | after context |
raw_choice | before context |
|
bool | tess_acceptable_word (WERD_RES *word) |
|
| Wordrec () |
|
| ~Wordrec () override=default |
|
void | SaveAltChoices (const LIST &best_choices, WERD_RES *word) |
|
void | FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) |
|
void | CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) |
|
void | SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
|
void | InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
|
void | DoSegSearch (WERD_RES *word_res) |
|
void | add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams) |
|
void | choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile) |
|
void | combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue) |
|
SEAM * | pick_good_seam (TBLOB *blob) |
|
void | try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob) |
|
void | try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], int16_t num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob) |
|
PRIORITY | grade_split_length (SPLIT *split) |
|
PRIORITY | grade_sharpness (SPLIT *split) |
|
bool | near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt) |
|
virtual BLOB_CHOICE_LIST * | classify_piece (const GenericVector< SEAM * > &seams, int16_t start, int16_t end, const char *description, TWERD *word, BlamerBundle *blamer_bundle) |
|
void | merge_fragments (MATRIX *ratings, int16_t num_blobs) |
|
void | get_fragment_lists (int16_t current_frag, int16_t current_row, int16_t start, int16_t num_frag_parts, int16_t num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists) |
|
void | merge_and_put_fragment_lists (int16_t row, int16_t column, int16_t num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings) |
|
void | fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices) |
|
void | program_editup (const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict) |
|
void | cc_recog (WERD_RES *word) |
|
void | program_editdown (int32_t elasped_time) |
|
void | set_pass1 () |
|
void | set_pass2 () |
|
int | end_recog () |
|
BLOB_CHOICE_LIST * | call_matcher (TBLOB *blob) |
|
int | dict_word (const WERD_CHOICE &word) |
|
BLOB_CHOICE_LIST * | classify_blob (TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle) |
|
PRIORITY | point_priority (EDGEPT *point) |
|
void | add_point_to_list (PointHeap *point_heap, EDGEPT *point) |
|
bool | is_inside_angle (EDGEPT *pt) |
|
int | angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3) |
|
EDGEPT * | pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist) |
|
void | prioritize_points (TESSLINE *outline, PointHeap *points) |
|
void | new_min_point (EDGEPT *local_min, PointHeap *points) |
|
void | new_max_point (EDGEPT *local_max, PointHeap *points) |
|
void | vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points) |
|
SEAM * | attempt_blob_chop (TWERD *word, TBLOB *blob, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams) |
|
SEAM * | chop_numbered_blob (TWERD *word, int32_t blob_number, bool italic_blob, const GenericVector< SEAM * > &seams) |
|
SEAM * | chop_overlapping_blob (const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number) |
|
SEAM * | improve_one_blob (const GenericVector< BLOB_CHOICE * > &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number) |
|
SEAM * | chop_one_blob (const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number) |
|
void | chop_word_main (WERD_RES *word) |
|
void | improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending) |
|
int | select_blob_to_split (const GenericVector< BLOB_CHOICE * > &blob_choices, float rating_ceiling, bool split_next_to_fragment) |
|
int | select_blob_to_split_from_fixpt (DANGERR *fixpt) |
|
| Classify () |
|
| ~Classify () override |
|
const ShapeTable * | shape_table () const |
|
void | SetStaticClassifier (ShapeClassifier *static_classifier) |
|
void | AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices) |
|
bool | LargeSpeckle (const TBLOB &blob) |
|
ADAPT_TEMPLATES | NewAdaptedTemplates (bool InitFromUnicharset) |
|
int | GetFontinfoId (ADAPT_CLASS Class, uint8_t ConfigId) |
|
int | PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uint8_t *normalization_factors, const uint16_t *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results) |
|
void | ReadNewCutoffs (TFile *fp, uint16_t *Cutoffs) |
|
void | PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates) |
|
void | WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates) |
|
ADAPT_TEMPLATES | ReadAdaptedTemplates (TFile *File) |
|
float | ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, bool DebugMatch) |
|
void | FreeNormProtos () |
|
NORM_PROTOS * | ReadNormProtos (TFile *fp) |
|
void | ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class) |
|
INT_TEMPLATES | CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset) |
|
void | LearnWord (const char *fontname, WERD_RES *word) |
|
void | LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word) |
|
void | InitAdaptiveClassifier (TessdataManager *mgr) |
|
void | InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates) |
|
void | AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results) |
|
void | MasterMatcher (INT_TEMPLATES templates, int16_t num_features, const INT_FEATURE_STRUCT *features, const uint8_t *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results) |
|
void | ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uint8_t *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results) |
|
double | ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uint8_t *cn_factors) |
|
void | ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices) |
|
void | AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results) |
|
int | GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures) |
|
void | DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results) |
|
PROTO_ID | MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask) |
|
int | MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures) |
|
void | MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob) |
|
void | PrintAdaptiveMatchResults (const ADAPT_RESULTS &results) |
|
void | RemoveExtraPuncs (ADAPT_RESULTS *Results) |
|
void | RemoveBadMatches (ADAPT_RESULTS *Results) |
|
void | SetAdaptiveThreshold (float Threshold) |
|
void | ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features) |
|
STRING | ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const |
|
int | ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const |
|
int | ShapeIDToClassID (int shape_id) const |
|
UNICHAR_ID * | BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results) |
|
int | CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results) |
|
int | CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results) |
|
UNICHAR_ID * | GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass) |
|
void | DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results) |
|
void | AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, float Threshold, ADAPT_TEMPLATES adaptive_templates) |
|
void | DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class) |
|
bool | AdaptableWord (WERD_RES *word) |
|
void | EndAdaptiveClassifier () |
|
void | SettupPass1 () |
|
void | SettupPass2 () |
|
void | AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices) |
|
void | ClassifyAsNoise (ADAPT_RESULTS *Results) |
|
void | ResetAdaptiveClassifierInternal () |
|
void | SwitchAdaptiveClassifier () |
|
void | StartBackupAdaptiveClassifier () |
|
int | GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uint8_t *pruner_norm_array, uint8_t *char_norm_array) |
|
void | ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uint8_t *char_norm_array, uint8_t *pruner_array) |
|
bool | TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config) |
|
void | UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob) |
|
bool | AdaptiveClassifierIsFull () const |
|
bool | AdaptiveClassifierIsEmpty () const |
|
bool | LooksLikeGarbage (TBLOB *blob) |
|
void | RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox) |
|
void | ClearCharNormArray (uint8_t *char_norm_array) |
|
void | ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uint8_t *char_norm_array) |
|
void | ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures) |
|
INT_TEMPLATES | ReadIntTemplates (TFile *fp) |
|
void | WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset) |
|
CLASS_ID | GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id) |
|
void | ShowMatchDisplay () |
|
UnicityTable< FontInfo > & | get_fontinfo_table () |
|
const UnicityTable< FontInfo > & | get_fontinfo_table () const |
|
UnicityTable< FontSet > & | get_fontset_table () |
|
void | NormalizeOutlines (LIST Outlines, float *XScale, float *YScale) |
|
FEATURE_SET | ExtractOutlineFeatures (TBLOB *Blob) |
|
FEATURE_SET | ExtractPicoFeatures (TBLOB *Blob) |
|
FEATURE_SET | ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info) |
|
FEATURE_SET | ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info) |
|
void | LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text) |
|
bool | WriteTRFile (const STRING &filename) |
|
| CCStruct ()=default |
|
| ~CCStruct () override |
|
| CUtil ()=default |
|
| ~CUtil () override |
|
void | read_variables (const char *filename, bool global_only) |
|
| CCUtil () |
|
virtual | ~CCUtil () |
|
void | main_setup (const char *argv0, const char *basename) |
| CCUtil::main_setup - set location of tessdata and name of image. More...
|
|
ParamsVectors * | params () |
|
|
static void | SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info) |
|
static void | ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts) |
|
static const double | kDescenderFraction = 0.25 |
|
static const double | kXHeightFraction = 0.5 |
|
static const double | kAscenderFraction = 0.25 |
|
static const double | kXHeightCapRatio |
|
bool | SegSearchDone (int num_futile_classifications) |
|
void | UpdateSegSearchNodes (float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
|
void | ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle) |
|
void | ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending) |
|
void | InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug) |
|
IntegerMatcher | im_ |
|
FEATURE_DEFS_STRUCT | feature_defs_ |
|
ShapeTable * | shape_table_ |
|
Definition at line 174 of file tesseractclass.h.
◆ Tesseract()
tesseract::Tesseract::Tesseract |
( |
| ) |
|
Definition at line 52 of file tesseractclass.cpp.
54 "Take segmentation and labeling from box file",
57 "Conversion of word/line box file to char box file",
60 "Generate training data from boxed chars", this->
params()),
62 "Generate more boxes from boxed chars", this->
params()),
64 "Break input into lines and remap boxes if present",
67 "Dump intermediate images made during page segmentation",
70 "Try inverting the image in `LSTMRecognizeWord`", this->
params()),
75 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto_only, 3=auto, 4=column,"
76 " 5=block_vert, 6=block, 7=line, 8=word, 9=word_circle, 10=char,"
77 "11=sparse_text, 12=sparse_text+osd, 13=raw_line"
78 " (Values from PageSegMode enum in publictypes.h)",
81 "Which OCR engine(s) to run (Tesseract, LSTM, both)."
82 " Defaults to loading and running the most accurate"
86 "Blacklist of chars not to recognize", this->
params()),
88 "Whitelist of chars to recognize", this->
params()),
90 "List of chars to override tessedit_char_blacklist",
93 "Perform training for ambiguities", this->
params()),
96 "Whether to use the top-line splitting process for Devanagari "
97 "documents while performing page-segmentation.",
101 "Whether to use the top-line splitting process for Devanagari "
102 "documents while performing ocr.",
105 "Write all parameters to the given file.", this->
params()),
107 "Generate and print debug"
108 " information for adaption",
115 "Exposure value follows"
116 " this pattern in the image filename. The name of the image"
117 " files are expected to be in the form"
118 " [lang].[fontname].exp[num].tif",
121 "Learn both character fragments (as is done in the"
122 " special low exposure mode) as well as unfragmented"
127 " is assumed to contain ngrams. Only learn the ngrams"
128 " whose outlines overlap horizontally.",
137 "Try to improve fuzzy spaces", this->
params()),
139 "Don't bother with word plausibility", this->
params()),
143 "Add words to the document dictionary", this->
params()),
149 "Enable correction based on the word bigram dictionary.",
152 "Enable single word correction based on the dictionary.",
155 "Amount of debug output for bigram correction.",
158 "Remove and conditionally reassign small outlines when they"
159 " confuse layout analysis, determining diacritics vs noise",
167 "Hingepoint for base char certainty", this->
params()),
171 "Hingepoint for disjoint certainty", this->
params()),
175 "Threshold for new punc char certainty", this->
params()),
178 "Scaling on certainty diff from Hingepoint",
192 "good_quality_doc lte rejection limit", this->
params()),
194 "good_quality_doc gte good blobs limit", this->
params()),
196 "good_quality_doc lte outline error limit", this->
params()),
198 "good_quality_doc gte good char limit", this->
params()),
202 "Adaptation decision algorithm for tess", this->
params()),
204 "Do minimal rejection on pass 1 output", this->
params()),
215 "Run paragraph detection on the post-text-recognition "
219 "Use ratings matrix/beam search with lstm", this->
params()),
225 "Reduce rejection on good docs", this->
params()),
229 "%rej allowed before rej whole doc", this->
params()),
231 "%rej allowed before rej whole block", this->
params()),
233 "%rej allowed before rej whole row", this->
params()),
235 "Number of row rejects in whole word rejects"
236 " which prevents whole row rejection",
239 "Only rej partially rejected words in block rejection",
242 "Only rej partially rejected words in row rejection",
245 "Use word segmentation quality metric", this->
params()),
247 "Use word segmentation quality metric", this->
params()),
249 "Only preserve wds longer than this", this->
params()),
251 "Apply row rejection to good docs", this->
params()),
253 "rej good doc wd if more than this fraction rejected",
256 "Reject all bad quality wds", this->
params()),
260 "Output data to debug file", this->
params()),
264 "good_quality_doc gte good char limit", this->
params()),
266 "Mark v.bad words for tilde crunch", this->
params()),
274 "Take out ~^ early?", this->
params()),
279 "crunch garbage cert lt this", this->
params()),
281 "crunch garbage rating lt this", this->
params()),
295 "Del if word width lt xht x this", this->
params()),
297 "Del if word gt xht x this above bl", this->
params()),
299 "Del if word gt xht x this below bl", this->
params()),
305 "How many potential indicators needed", this->
params()),
311 "Don't pot crunch sensible strings", this->
params()),
315 "Don't crunch words with long lower case strings",
318 "Don't crunch words with long lower case strings",
321 "Crunch words with long repetitions", this->
params()),
324 "How many non-noise blbs either side?", this->
params()),
328 "Reward punctuation joins", this->
params()),
334 "Punct. chs expected WITHIN numbers", this->
params()),
336 "Max allowed deviation of blob top outside of font data",
339 "Min change in xht before actually trying it", this->
params()),
341 "Debug level for sub & superscript fixer", this->
params()),
344 "How many times worse "
345 "certainty does a superscript position glyph need to be for "
346 "us to try classifying it as a char with a different "
352 "badness do we think sufficient to choose a superscript "
353 "over what we'd thought. For example, a value of 0.6 means "
354 "we want to reduce badness of certainty by at least 40%",
357 "A superscript scaled down more than this is unbelievably "
358 "small. For example, 0.3 means we expect the font size to "
359 "be no smaller than 30% of the text line font size.",
362 "Maximum top of a character measured as a multiple of "
363 "x-height above the baseline for us to reconsider whether "
367 "Minimum bottom of a character measured as a multiple of "
368 "x-height above the baseline for us to reconsider whether "
369 "it's a superscript.",
372 "Write block separators in output", this->
params()),
392 "Create PDF with only one invisible text layer",
398 "Specify minimum characters to try during OSD",
401 "Output char for unidentified blobs", this->
params()),
404 "Don't suspect dict wds longer than this", this->
params()),
408 "Don't touch bad rating limit", this->
params()),
412 "Only reject tess failures", this->
params()),
416 "Make output have exactly one word per WERD", this->
params()),
418 "Don't reject ANYTHING AT ALL", this->
params()),
426 "Aspect ratio dot/hyphen test", this->
params()),
428 "Aspect ratio dot/hyphen test", this->
params()),
430 "Use DOC dawg in 11l conf. detector", this->
params()),
446 "if >this fract", this->
params()),
450 "Allow NN to unrej", this->
params()),
458 "-1 -> All pages, else specific page to process",
461 "Capture the image from the IPE", this->
params()),
468 "List of languages to load with this one", this->
params()),
470 "In multilingual mode use params model of the"
474 "Min acceptable orientation margin", this->
params()),
480 "Allow feature extractors to see the original outline",
483 "Only initialize with the config file. Useful if the "
484 "instance is not going to be used for OCR but say only "
485 "for layout analysis.",
490 "Enable vertical detection", this->
params()),
492 "Force using vertical text page mode", this->
params()),
495 "Fraction of textlines deemed vertical to use vertical page "
500 "Fraction of height used as a minimum gap for aligned blobs.",
505 "Preserve multiple interword spaces", this->
params()),
507 "Page separator (default is form feed control character)",
510 "Allows to include alternative symbols choices in the hOCR output. "
511 "Valid input values are 0, 1, 2 and 3. 0 is the default value. "
512 "With 1 the alternative symbol choices per timestep are included. "
513 "With 2 the alternative symbol choices are accumulated per "
517 "Detect music staff and remove intersecting components", this->
params()),
519 backup_config_file_(
nullptr),
520 pix_binary_(
nullptr),
522 pix_original_(
nullptr),
523 pix_thresholds_(
nullptr),
524 source_resolution_(0),
526 right_to_left_(
false),
527 scaled_color_(
nullptr),
531 most_recently_used_(
this),
533 equ_detect_(
nullptr),
534 #ifndef ANDROID_BUILD
535 lstm_recognizer_(
nullptr),
537 train_line_page_num_(0) {
◆ ~Tesseract()
tesseract::Tesseract::~Tesseract |
( |
| ) |
|
|
override |
Definition at line 540 of file tesseractclass.cpp.
542 pixDestroy(&pix_original_);
544 sub_langs_.delete_data_pointers();
545 #ifndef ANDROID_BUILD
546 delete lstm_recognizer_;
547 lstm_recognizer_ =
nullptr;
◆ acceptable_number_string()
bool tesseract::Tesseract::acceptable_number_string |
( |
const char * |
s, |
|
|
const char * |
lengths |
|
) |
| |
Definition at line 391 of file output.cpp.
401 else if (prev_digit &&
402 (*lengths == 1 && ((*s ==
'.') || (*s ==
',') || (*s ==
'-'))))
404 else if (prev_digit && *lengths == 1 &&
405 (*(s + *lengths) ==
'\0') && ((*s ==
'%') || (*s ==
')')))
407 else if (prev_digit &&
408 *lengths == 1 && (*s ==
'%') &&
409 (*(lengths + 1) == 1 && *(s + *lengths) ==
')') &&
410 (*(s + *lengths + *(lengths + 1)) ==
'\0'))
◆ acceptable_word_string()
Definition at line 1745 of file control.cpp.
1749 int leading_punct_count;
1750 int upper_count = 0;
1751 int hyphen_pos = -1;
1754 if (strlen (lengths) > 20)
1760 offset += lengths[i++];
1761 leading_punct_count = i;
1764 while (s[offset] !=
'\0' && char_set.
get_isupper(s + offset, lengths[i])) {
1765 offset += lengths[i++];
1768 if (upper_count > 1) {
1772 while (s[offset] !=
'\0' && char_set.
get_islower(s + offset, lengths[i])) {
1773 offset += lengths[i++];
1781 if (lengths[i] == 1 && s[offset] ==
'-') {
1783 offset += lengths[i++];
1784 if (s[offset] !=
'\0') {
1785 while ((s[offset] !=
'\0') &&
1787 offset += lengths[i++];
1789 if (i < hyphen_pos + 3)
1794 if (lengths[i] == 1 && (s[offset] ==
'\'') &&
1795 lengths[i + 1] == 1 && (s[offset + lengths[i]] ==
's')) {
1796 offset += lengths[i++];
1797 offset += lengths[i++];
1800 if (upper_count > 0)
1807 if (lengths[i] == 1 && s[offset] !=
'\0' &&
1809 offset += lengths[i++];
1810 if (lengths[i] == 1 && s[offset] !=
'\0' && i > 0 &&
1811 s[offset - lengths[i - 1]] != s[offset] &&
1813 offset += lengths[i++];
1815 if (s[offset] !=
'\0')
1824 if (s[0] !=
'\0' && char_set.
get_isupper(s, lengths[0])) {
1826 while (s[offset] !=
'\0' &&
1828 lengths[i + 1] == 1 && s[offset + lengths[i]] ==
'.') {
1829 offset += lengths[i++];
1830 offset += lengths[i++];
1833 else if (s[0] !=
'\0' && char_set.
get_islower(s, lengths[0])) {
1835 while (s[offset] !=
'\0' &&
1837 lengths[i + 1] == 1 && s[offset + lengths[i]] ==
'.') {
1838 offset += lengths[i++];
1839 offset += lengths[i++];
1842 if (s[offset] !=
'\0')
◆ alpha_count()
int16_t tesseract::Tesseract::alpha_count |
( |
const char * |
word, |
|
|
const char * |
word_lengths |
|
) |
| |
Definition at line 501 of file reject.cpp.
511 const char* word_lengths) {
◆ ambigs_classify_and_output()
void tesseract::Tesseract::ambigs_classify_and_output |
( |
const char * |
label, |
|
|
PAGE_RES_IT * |
pr_it, |
|
|
FILE * |
output_file |
|
) |
| |
Definition at line 211 of file recogtraining.cpp.
216 WordData word_data(*pr_it);
226 tprintf(
"Not outputting illegal unichar %s\n", label);
232 const auto** blob_choices =
new const BLOB_CHOICE*[dim];
235 delete[] blob_choices;
◆ AnyLSTMLang()
bool tesseract::Tesseract::AnyLSTMLang |
( |
| ) |
const |
|
inline |
Definition at line 295 of file tesseractclass.h.
298 for (
int i = 0; i < sub_langs_.size(); ++i) {
◆ AnyTessLang()
bool tesseract::Tesseract::AnyTessLang |
( |
| ) |
const |
|
inline |
Definition at line 285 of file tesseractclass.h.
288 for (
int i = 0; i < sub_langs_.size(); ++i) {
◆ ApplyBoxes()
PAGE_RES * tesseract::Tesseract::ApplyBoxes |
( |
const STRING & |
fname, |
|
|
bool |
find_segmentation, |
|
|
BLOCK_LIST * |
block_list |
|
) |
| |
Definition at line 109 of file applybox.cpp.
119 const int box_count = boxes.
size();
120 int box_failures = 0;
124 PAGE_RES* page_res = find_segmentation ?
126 clear_any_old_text(block_list);
128 for (
int i = 0; i < box_count; i++) {
129 bool foundit =
false;
130 if (page_res !=
nullptr) {
132 (i == 0) ?
nullptr : &boxes[i - 1],
134 (i == box_count - 1) ?
nullptr : &boxes[i + 1],
135 full_texts[i].
string());
138 (i == box_count - 1) ?
nullptr : &boxes[i + 1],
144 "FAILURE! Couldn't find a matching blob");
148 if (page_res ==
nullptr) {
156 tprintf(
" Boxes read from boxfile: %6d\n", box_count);
157 if (box_failures > 0)
158 tprintf(
" Boxes failed resegmentation: %6d\n", box_failures);
◆ ApplyBoxTraining()
void tesseract::Tesseract::ApplyBoxTraining |
( |
const STRING & |
fontname, |
|
|
PAGE_RES * |
page_res |
|
) |
| |
◆ AssignDiacriticsToNewBlobs()
Definition at line 1064 of file control.cpp.
1072 for (
int i = 0; i < outlines.
size(); ++i) {
1073 if (outlines[i] ==
nullptr)
continue;
1076 int num_blob_outlines = 0;
1077 TBOX total_ol_box(outlines[i]->bounding_box());
1078 while (i < outlines.
size() && outlines[i] !=
nullptr) {
1079 blob_wanted[i] =
true;
1080 total_ol_box += outlines[i]->bounding_box();
1082 ++num_blob_outlines;
1086 while (!blob_it.at_last() &&
1087 blob_it.data_relative(1)->bounding_box().left() <=
1088 total_ol_box.left()) {
1094 tprintf(
"Num blobless outlines = %d\n", num_blob_outlines);
1095 C_BLOB* left_blob = blob_it.data();
1097 C_BLOB* right_blob = blob_it.at_last() ? nullptr : blob_it.data_relative(1);
1098 if ((left_box.
x_overlap(total_ol_box) || right_blob ==
nullptr ||
1101 outlines, num_blob_outlines,
1104 for (
int j = 0; j < blob_wanted.
size(); ++j) {
1105 if (blob_wanted[j]) {
1106 (*word_wanted)[j] =
true;
1107 (*target_blobs)[j] = left_blob;
1110 }
else if (right_blob !=
nullptr &&
1114 right_blob, outlines,
1115 num_blob_outlines, &blob_wanted)) {
1117 for (
int j = 0; j < blob_wanted.
size(); ++j) {
1118 if (blob_wanted[j]) {
1119 (*word_wanted)[j] =
true;
1120 (*target_blobs)[j] = right_blob;
1124 outlines, num_blob_outlines,
1127 for (
int j = 0; j < blob_wanted.
size(); ++j) {
1128 if (blob_wanted[j]) {
1129 (*word_wanted)[j] =
true;
1130 (*target_blobs)[j] =
nullptr;
◆ AssignDiacriticsToOverlappingBlobs()
Definition at line 1011 of file control.cpp.
1025 for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1026 C_BLOB* blob = blob_it.data();
1029 int num_blob_outlines = 0;
1030 for (
int i = 0; i < outlines.
size(); ++i) {
1032 !(*word_wanted)[i]) {
1033 blob_wanted[i] =
true;
1034 (*overlapped_any_blob)[i] =
true;
1035 ++num_blob_outlines;
1039 tprintf(
"%d noise outlines overlap blob at:", num_blob_outlines);
1048 outlines, num_blob_outlines,
1050 for (
int i = 0; i < blob_wanted.
size(); ++i) {
1051 if (blob_wanted[i]) {
1053 (*word_wanted)[i] =
true;
1054 (*target_blobs)[i] = blob;
◆ AutoPageSeg()
int tesseract::Tesseract::AutoPageSeg |
( |
PageSegMode |
pageseg_mode, |
|
|
BLOCK_LIST * |
blocks, |
|
|
TO_BLOCK_LIST * |
to_blocks, |
|
|
BLOBNBOX_LIST * |
diacritic_blobs, |
|
|
Tesseract * |
osd_tess, |
|
|
OSResults * |
osr |
|
) |
| |
Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.
Resolution (in ppi) is derived from the input image.
The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.
If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.
If diacritic_blobs is non-null, then diacritic/noise blobs that would confuse layout analysis by causing textline overlap are placed there, with the expectation that they will be reassigned to words later and noise/diacriticness determined via classification.
If osd (orientation and script detection) is true then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired (osd or only_osd), then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).
Definition at line 216 of file pagesegmain.cpp.
220 pixOr(photomask_pix, photomask_pix, musicmask_pix);
223 finder->SetEquationDetect(equ_detect_);
225 result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
226 to_block, photomask_pix, pix_thresholds_,
227 pix_grey_, &pixa_debug_, &found_blocks,
228 diacritic_blobs, to_blocks);
230 finder->GetDeskewVectors(&deskew_, &reskew_);
233 pixDestroy(&photomask_pix);
234 pixDestroy(&musicmask_pix);
235 if (result < 0)
return result;
238 BLOCK_IT block_it(blocks);
240 block_it.add_list_after(&found_blocks);
246 static void AddAllScriptsConverted(
const UNICHARSET& sid_set,
◆ BelievableSuperscript()
bool tesseract::Tesseract::BelievableSuperscript |
( |
bool |
debug, |
|
|
const WERD_RES & |
word, |
|
|
float |
certainty_threshold, |
|
|
int * |
left_ok, |
|
|
int * |
right_ok |
|
) |
| const |
Return whether this is a believable superscript or subscript text.
We insist that:
- there are no punctuation marks.
- there are no italics.
- no normal-sized character is smaller than superscript_scaledown_ratio of what it ought to be, and
- each character is at least as certain as certainty_threshold.
- Parameters
-
[in] | debug | If true, spew debug output |
[in] | word | The word whose best_choice we're evaluating |
[in] | certainty_threshold | If any of the characters have less certainty than this, reject. |
[out] | left_ok | How many left-side characters were ok? |
[out] | right_ok | How many right-side characters were ok? |
- Returns
- Whether the complete best choice is believable as a superscript.
Definition at line 522 of file superscript.cpp.
527 int initial_ok_run_count = 0;
528 int ok_run_count = 0;
529 float worst_certainty = 0.0f;
533 for (
int i = 0; i < wc.
length(); i++) {
537 bool bad_certainty = char_certainty < certainty_threshold;
541 if (choice && fontinfo_table.
size() > 0) {
544 bool font1_is_italic = font_id1 >= 0
547 is_italic = font1_is_italic &&
548 (font_id2 < 0 || fontinfo_table.
get(font_id2).
is_italic());
551 float height_fraction = 1.0f;
553 float normal_height = char_height;
555 int min_bot, max_bot, min_top, max_top;
559 float hi_height = max_top - max_bot;
560 float lo_height = min_top - min_bot;
561 normal_height = (hi_height + lo_height) / 2;
565 height_fraction = char_height / normal_height;
572 tprintf(
" Rejecting: superscript is italic.\n");
575 tprintf(
" Rejecting: punctuation present.\n");
579 tprintf(
" Rejecting: don't believe character %s with certainty %.2f "
580 "which is less than threshold %.2f\n", char_str,
581 char_certainty, certainty_threshold);
584 tprintf(
" Rejecting: character %s seems too small @ %.2f versus "
585 "expected %.2f\n", char_str, char_height, normal_height);
588 if (bad_certainty || bad_height || is_punc || is_italic) {
589 if (ok_run_count == i) {
590 initial_ok_run_count = ok_run_count;
596 if (char_certainty < worst_certainty) {
597 worst_certainty = char_certainty;
600 bool all_ok = ok_run_count == wc.
length();
601 if (all_ok && debug) {
602 tprintf(
" Accept: worst revised certainty is %.2f\n", worst_certainty);
605 if (left_ok) *left_ok = initial_ok_run_count;
606 if (right_ok) *right_ok = ok_run_count;
◆ BestPix()
Pix* tesseract::Tesseract::BestPix |
( |
| ) |
const |
|
inline |
Definition at line 233 of file tesseractclass.h.
234 if (pixGetWidth(pix_original_) ==
ImageWidth()) {
235 return pix_original_;
236 }
else if (pix_grey_ !=
nullptr) {
◆ bigram_correction_pass()
void tesseract::Tesseract::bigram_correction_pass |
( |
PAGE_RES * |
page_res | ) |
|
Definition at line 467 of file control.cpp.
474 while (word_it.forward() !=
nullptr &&
475 (!word_it.word() || word_it.word()->part_of_combo)) {
478 if (!word_it.word())
break;
485 tprintf(
"Skipping because one of the words is W_REP_CHAR\n");
510 tprintf(
"Top choice \"%s %s\" verified by bigram model.\n",
516 tprintf(
"Examining alt choices for \"%s %s\".\n",
527 float best_rating = 0.0;
530 for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
539 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
550 if (overrides_word1.
size() == 1 ||
553 best_idx = overrides_word1.
size() - 1;
558 if (!overrides_word1.
empty()) {
561 *overrides_word1[best_idx]) &&
563 *overrides_word2[best_idx])) {
565 tprintf(
"Top choice \"%s %s\" verified (sans case) by bigram "
566 "model.\n", orig_w1_str.
string(), orig_w2_str.
string());
570 const STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
571 const STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
572 if (new_w1_str != orig_w1_str) {
575 if (new_w2_str != orig_w2_str) {
579 STRING choices_description;
580 int num_bigram_choices
581 = overrides_word1.
size() * overrides_word2.
size();
582 if (num_bigram_choices == 1) {
583 choices_description =
"This was the unique bigram choice.";
587 const int kMaxChoicesToPrint = 20;
588 for (
int i = 0; i < overrides_word1.
size() &&
589 i < kMaxChoicesToPrint; i++) {
590 if (i > 0) { bigrams_list +=
", "; }
595 choices_description =
"There were many choices: {";
596 choices_description += bigrams_list;
597 choices_description +=
"}";
599 choices_description.
add_str_int(
"There were ", num_bigram_choices);
600 choices_description +=
" compatible bigrams.";
603 tprintf(
"Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
606 choices_description.
string());
◆ blamer_pass()
void tesseract::Tesseract::blamer_pass |
( |
PAGE_RES * |
page_res | ) |
|
Definition at line 710 of file control.cpp.
713 for (page_res_it.restart_page(); page_res_it.word() !=
nullptr;
714 page_res_it.forward()) {
◆ blob_feature_display()
void tesseract::Tesseract::blob_feature_display |
( |
PAGE_RES * |
page_res, |
|
|
const TBOX & |
selection_box |
|
) |
| |
◆ blob_noise_score()
float tesseract::Tesseract::blob_noise_score |
( |
TBLOB * |
blob | ) |
|
Definition at line 787 of file fixspace.cpp.
789 int16_t outline_count = 0;
790 int16_t max_dimension;
791 int16_t largest_outline_dimension = 0;
795 box = ol->bounding_box();
797 max_dimension = box.
height();
799 max_dimension = box.
width();
802 if (largest_outline_dimension < max_dimension)
803 largest_outline_dimension = max_dimension;
806 if (outline_count > 5) {
808 largest_outline_dimension *= 2;
815 largest_outline_dimension /= 2;
818 return largest_outline_dimension;
◆ break_noisiest_blob_word()
void tesseract::Tesseract::break_noisiest_blob_word |
( |
WERD_RES_LIST & |
words | ) |
|
break_noisiest_blob_word() Find the word with the blob which looks like the worst noise. Break the word into two, deleting the noise blob.
Definition at line 642 of file fixspace.cpp.
643 WERD_RES_IT word_it(&words);
644 WERD_RES_IT worst_word_it;
645 float worst_noise_score = 9999;
646 int worst_blob_index = -1;
651 C_BLOB_IT rej_cblob_it;
652 C_BLOB_LIST new_blob_list;
653 C_BLOB_IT new_blob_it;
654 C_BLOB_IT new_rej_cblob_it;
656 int16_t start_of_noise_blob;
659 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
661 if (blob_index > -1 && worst_noise_score > noise_score) {
662 worst_noise_score = noise_score;
663 worst_blob_index = blob_index;
664 worst_word_it = word_it;
667 if (worst_blob_index < 0) {
674 word_res = worst_word_it.data();
678 new_blob_it.set_to_list(&new_blob_list);
680 for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
681 new_blob_it.add_after_then_move(blob_it.extract());
683 start_of_noise_blob = blob_it.data()->bounding_box().left();
684 delete blob_it.extract();
686 new_word =
new WERD(&new_blob_list, word_res->
word);
694 (!rej_cblob_it.empty() &&
695 (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
696 rej_cblob_it.forward()) {
697 new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
700 auto* new_word_res =
new WERD_RES(new_word);
701 new_word_res->combination =
true;
702 worst_word_it.add_before_then_move(new_word_res);
◆ build_menu_new()
SVMenuNode * tesseract::Tesseract::build_menu_new |
( |
| ) |
|
Definition at line 298 of file pgedit.cpp.
313 parent_menu = root_menu_item->
AddChild(
"DISPLAY");
332 parent_menu = root_menu_item->
AddChild(
"OTHER");
341 return root_menu_item;
◆ check_debug_pt()
bool tesseract::Tesseract::check_debug_pt |
( |
WERD_RES * |
word, |
|
|
int |
location |
|
) |
| |
Definition at line 1849 of file control.cpp.
1850 bool show_map_detail =
false;
1867 tprintf (
"classify_word_pass1 start\n");
1871 tprintf (
"make_reject_map: initial map");
1874 tprintf (
"make_reject_map: after NN");
1877 tprintf (
"classify_word_pass2 - START");
1880 tprintf (
"classify_word_pass2 - Pre Xht");
1883 tprintf (
"classify_word_pass2 - END");
1884 show_map_detail =
true;
1896 tprintf (
"After Poor quality rejection");
1899 tprintf (
"unrej_good_quality_words - START");
1902 tprintf (
"unrej_good_quality_words - END");
1905 tprintf (
"Write results pass");
1906 show_map_detail =
true;
1913 if (show_map_detail) {
1921 tprintf(
"null best choice\n");
1924 tprintf (
"Done flag: %s\n\n", word->
done ?
"TRUE" :
"FALSE");
◆ classify_word_and_language()
void tesseract::Tesseract::classify_word_and_language |
( |
int |
pass_n, |
|
|
PAGE_RES_IT * |
pr_it, |
|
|
WordData * |
word_data |
|
) |
| |
Definition at line 1319 of file control.cpp.
1321 #ifdef DISABLED_LEGACY_ENGINE
1326 #endif // def DISABLED_LEGACY_ENGINE
1329 PointerVector<WERD_RES> best_words;
1332 clock_t start_t = clock();
1335 tprintf(
"%s word with lang %s at:",
1336 word->
done ?
"Already done" :
"Processing",
1346 int sub = sub_langs_.size();
1347 if (most_recently_used_ !=
this) {
1349 for (sub = 0; sub < sub_langs_.size() &&
1350 most_recently_used_ != sub_langs_[sub]; ++sub) {}
1353 *word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
1354 Tesseract* best_lang_tess = most_recently_used_;
1355 if (!WordsAcceptable(best_words)) {
1357 if (most_recently_used_ !=
this &&
1359 &word_data->lang_words[sub_langs_.size()],
1361 best_lang_tess =
this;
1363 for (
int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
1365 if (most_recently_used_ != sub_langs_[i] &&
1367 &word_data->lang_words[i],
1369 best_lang_tess = sub_langs_[i];
1373 most_recently_used_ = best_lang_tess;
1374 if (!best_words.empty()) {
1375 if (best_words.size() == 1 && !best_words[0]->combination) {
1377 word_data->word->ConsumeWordResults(best_words[0]);
1380 word_data->word = best_words.back();
1383 ASSERT_HOST(word_data->word->box_word !=
nullptr);
1387 clock_t ocr_t = clock();
1389 tprintf(
"%s (ocr took %.2f sec)\n",
1390 word_data->word->best_choice->unichar_string().string(),
1391 static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
◆ classify_word_pass1()
classify_word_pass1
Baseline normalize the word and pass it to Tess.
Definition at line 1401 of file control.cpp.
1404 ROW* row = word_data.row;
1405 BLOCK* block = word_data.block;
1407 ? word_data.prev_word->word->best_choice :
nullptr;
1408 #ifndef ANDROID_BUILD
1409 #ifdef DISABLED_LEGACY_ENGINE
1414 #endif // def DISABLED_LEGACY_ENGINE
1417 if (!out_words->
empty())
1426 #ifndef DISABLED_LEGACY_ENGINE
1433 #endif // ndef DISABLED_LEGACY_ENGINE
1435 #endif // ndef ANDROID_BUILD
1437 #ifndef DISABLED_LEGACY_ENGINE
1458 #endif // ndef DISABLED_LEGACY_ENGINE
◆ classify_word_pass2()
classify_word_pass2
Control what to do with the word in pass 2
Definition at line 1572 of file control.cpp.
1579 #ifndef DISABLED_LEGACY_ENGINE
1580 ROW* row = word_data.row;
1581 BLOCK* block = word_data.block;
1584 ? word_data.prev_word->word->best_choice :
nullptr;
1607 #ifndef GRAPHICS_DISABLED
1621 #endif // ndef DISABLED_LEGACY_ENGINE
◆ ClassifyBlobAsWord()
float tesseract::Tesseract::ClassifyBlobAsWord |
( |
int |
pass_n, |
|
|
PAGE_RES_IT * |
pr_it, |
|
|
C_BLOB * |
blob, |
|
|
STRING * |
best_str, |
|
|
float * |
c2 |
|
) |
| |
Definition at line 1270 of file control.cpp.
1278 while (it.word() != word_res && it.word() !=
nullptr) it.forward();
1285 if (wd.word->raw_choice !=
nullptr) {
1286 tprintf(
"word xheight=%g, row=%g, range=[%g,%g]\n", word_res->
x_height,
1287 wd.row->x_height(), wd.word->raw_choice->min_x_height(),
1288 wd.word->raw_choice->max_x_height());
1290 tprintf(
"Got word with null raw choice xheight=%g, row=%g\n", word_res->
x_height,
1291 wd.row->x_height());
1295 if (wd.word->raw_choice !=
nullptr) {
1296 cert = wd.word->raw_choice->certainty();
1297 float rat = wd.word->raw_choice->rating();
1298 *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1299 *best_str = wd.word->raw_choice->unichar_string();
1304 it.DeleteCurrentWord();
◆ ClassifyBlobPlusOutlines()
Definition at line 1226 of file control.cpp.
1232 C_BLOB* local_blob =
nullptr;
1233 if (blob !=
nullptr) {
1235 ol_it.set_to_list(blob->
out_list());
1236 first_to_keep = ol_it.data();
1238 for (
int i = 0; i < ok_outlines.
size(); ++i) {
1239 if (ok_outlines[i]) {
1241 if (blob ==
nullptr) {
1242 local_blob =
new C_BLOB(outlines[i]);
1244 ol_it.set_to_list(blob->
out_list());
1246 ol_it.add_before_stay_put(outlines[i]);
1252 ol_it.move_to_first();
1253 if (first_to_keep ==
nullptr) {
1255 for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
1260 for (; ol_it.data() != first_to_keep; ol_it.forward()) {
◆ Clear()
void tesseract::Tesseract::Clear |
( |
| ) |
|
Definition at line 564 of file tesseractclass.cpp.
567 pixDestroy(&pix_binary_);
568 pixDestroy(&pix_grey_);
569 pixDestroy(&pix_thresholds_);
570 pixDestroy(&scaled_color_);
571 deskew_ =
FCOORD(1.0f, 0.0f);
572 reskew_ =
FCOORD(1.0f, 0.0f);
575 for (
int i = 0; i < sub_langs_.size(); ++i)
576 sub_langs_[i]->
Clear();
◆ ComputeCompatibleXheight()
float tesseract::Tesseract::ComputeCompatibleXheight |
( |
WERD_RES * |
word_res, |
|
|
float * |
baseline_shift |
|
) |
| |
Definition at line 119 of file fixxht.cpp.
131 tprintf(
"Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
133 height, min_bottom, max_bottom, min_top, max_top,
150 tprintf(
" xht range min=%d, max=%d\n", min_xht, max_xht);
154 for (
int y = min_xht; y <= max_xht; ++y)
155 top_stats.add(y, misfit_dist);
160 int min_shift = min_bottom - bottom;
161 int max_shift = max_bottom - bottom;
163 tprintf(
" bottom shift min=%d, max=%d\n", min_shift, max_shift);
168 int misfit_weight = abs(min_shift);
169 if (max_shift > min_shift)
170 misfit_weight /= max_shift - min_shift;
171 for (
int y = min_shift; y <= max_shift; ++y)
172 shift_stats.add(y, misfit_weight);
174 if (bottom_shift == 0) {
185 if (shift_stats.get_total() > top_stats.get_total()) {
188 tprintf(
"Applying bottom shift=%d\n", bottom_shift);
191 }
while (bottom_shift != 0 &&
192 top_stats.get_total() < shift_stats.get_total());
194 *baseline_shift = -bottom_shift / word_res->
denorm.
y_scale();
196 tprintf(
"baseline shift=%g\n", *baseline_shift);
198 if (top_stats.get_total() == 0)
199 return bottom_shift != 0 ? word_res->
x_height : 0.0f;
202 float new_xht = top_stats.median();
204 tprintf(
"Median xht=%f\n", new_xht);
205 tprintf(
"Mode20:A: New x-height = %f (norm), %f (orig)\n",
212 return bottom_shift != 0 ? word_res->
x_height : 0.0f;
◆ convert_bad_unlv_chs()
void tesseract::Tesseract::convert_bad_unlv_chs |
( |
WERD_RES * |
word_res | ) |
|
◆ ConvertStringToUnichars()
◆ CorrectClassifyWords()
void tesseract::Tesseract::CorrectClassifyWords |
( |
PAGE_RES * |
page_res | ) |
|
◆ count_alphanums() [1/2]
int16_t tesseract::Tesseract::count_alphanums |
( |
const WERD_CHOICE & |
word | ) |
|
◆ count_alphanums() [2/2]
int16_t tesseract::Tesseract::count_alphanums |
( |
WERD_RES * |
word | ) |
|
◆ count_alphas()
int16_t tesseract::Tesseract::count_alphas |
( |
const WERD_CHOICE & |
word | ) |
|
◆ count_outline_errs()
int16_t tesseract::Tesseract::count_outline_errs |
( |
char |
c, |
|
|
int16_t |
outline_count |
|
) |
| |
◆ CountMisfitTops()
int tesseract::Tesseract::CountMisfitTops |
( |
WERD_RES * |
word_res | ) |
|
Definition at line 87 of file fixxht.cpp.
90 tprintf(
"Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
92 bad ?
"Misfit" :
"OK", top, min_top, max_top,
103 float* baseline_shift) {
104 STATS top_stats(0, UINT8_MAX);
105 STATS shift_stats(-UINT8_MAX, UINT8_MAX);
106 int bottom_shift = 0;
111 for (
int blob_id = 0; blob_id < num_blobs; ++blob_id) {
◆ debug_word()
void tesseract::Tesseract::debug_word |
( |
PAGE_RES * |
page_res, |
|
|
const TBOX & |
selection_box |
|
) |
| |
debug_word
Process the whole image, but load word_config_ for the selected word(s).
Definition at line 665 of file pgedit.cpp.
666 #ifndef DISABLED_LEGACY_ENGINE
◆ dictionary_correction_pass()
void tesseract::Tesseract::dictionary_correction_pass |
( |
PAGE_RES * |
page_res | ) |
|
Definition at line 2093 of file control.cpp.
2096 word = word_it.forward()) {
2097 if (word->best_choices.singleton())
2101 if (word->tesseract->getDict().valid_word(*best) != 0)
2104 WERD_CHOICE_IT choice_it(&word->best_choices);
2105 for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
2106 choice_it.forward()) {
2108 if (word->tesseract->getDict().valid_word(*alternate)) {
2111 tprintf(
"Dictionary correction replaces best choice '%s' with '%s'\n",
2116 word->ReplaceBestChoice(alternate);
◆ digit_or_numeric_punct()
bool tesseract::Tesseract::digit_or_numeric_punct |
( |
WERD_RES * |
word, |
|
|
int |
char_position |
|
) |
| |
Definition at line 370 of file fixspace.cpp.
374 for (i = 0, offset = 0; i < char_position;
◆ do_re_display()
void tesseract::Tesseract::do_re_display |
( |
bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it) |
word_painter | ) |
|
do_re_display()
Redisplay page
Definition at line 349 of file pgedit.cpp.
355 image_win->
Image(pix_binary_, 0, 0);
360 for (
WERD_RES* word = pr_it.
word(); word !=
nullptr; word = pr_it.forward()) {
361 (this->*word_painter)(&pr_it);
362 if (display_baselines && pr_it.row() != pr_it.prev_row())
364 if (display_blocks && pr_it.block() != pr_it.prev_block())
365 pr_it.block()->block->pdblk.plot(image_win, block_count++,
ScrollView::RED);
◆ doc_and_block_rejection()
void tesseract::Tesseract::doc_and_block_rejection |
( |
PAGE_RES_IT & |
page_res_it, |
|
|
bool |
good_quality_doc |
|
) |
| |
Definition at line 249 of file docqual.cpp.
249 : %d #Rejects: %d; \n
",
250 page_res_it.page_res->char_count,
251 page_res_it.page_res->rej_count);
254 if (tessedit_debug_doc_rejection) {
255 tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
256 page_res_it.page_res->char_count,
257 page_res_it.page_res->rej_count);
260 /* Walk blocks testing for block rejection */
262 page_res_it.restart_page();
264 while ((word = page_res_it.word()) != nullptr) {
265 current_block = page_res_it.block();
266 block_no = current_block->block->pdblk.index();
267 if (current_block->char_count > 0 &&
268 (current_block->rej_count * 100.0 / current_block->char_count) >
269 tessedit_reject_block_percent) {
270 if (tessedit_debug_block_rejection) {
271 tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
272 block_no, current_block->char_count,
273 current_block->rej_count);
275 prev_word_rejected = false;
276 while ((word = page_res_it.word()) != nullptr &&
277 (page_res_it.block() == current_block)) {
278 if (tessedit_preserve_blk_rej_perfect_wds) {
279 rej_word = word->reject_map.reject_count() > 0 ||
280 word->reject_map.length () < tessedit_preserve_min_wd_len;
281 if (rej_word && tessedit_dont_blkrej_good_wds &&
282 word->reject_map.length() >= tessedit_preserve_min_wd_len &&
283 acceptable_word_string(
285 word->best_choice->unichar_string().string(),
286 word->best_choice->unichar_lengths().string()) !=
288 word_char_quality(word, page_res_it.row()->row,
290 &accepted_char_quality);
291 rej_word = char_quality != word->reject_map.length();
298 Reject spacing if both current and prev words are rejected.
299 NOTE - this is NOT restricted to FUZZY spaces. - When tried this
300 generated more space errors.
302 if (tessedit_use_reject_spaces &&
303 prev_word_rejected &&
304 page_res_it.prev_row() == page_res_it.row() &&
305 word->word->space() == 1)
306 word->reject_spaces = true;
307 word->reject_map.rej_word_block_rej();
309 prev_word_rejected = rej_word;
310 page_res_it.forward();
313 if (tessedit_debug_block_rejection) {
314 tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
315 block_no, page_res_it.block()->char_count,
316 page_res_it.block()->rej_count);
319 /* Walk rows in block testing for row rejection */
321 while (page_res_it.word() != nullptr &&
322 page_res_it.block() == current_block) {
323 current_row = page_res_it.row();
325 /* Reject whole row if:
326 fraction of chars on row which are rejected exceed a limit AND
327 fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
330 if (current_row->char_count > 0 &&
331 (current_row->rej_count * 100.0 / current_row->char_count) >
332 tessedit_reject_row_percent &&
333 (current_row->whole_word_rej_count * 100.0 /
334 current_row->rej_count) <
335 tessedit_whole_wd_rej_row_percent) {
336 if (tessedit_debug_block_rejection) {
337 tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
338 row_no, current_row->char_count,
339 current_row->rej_count);
341 prev_word_rejected = false;
342 while ((word = page_res_it.word()) != nullptr &&
343 page_res_it.row () == current_row) {
344 /* Preserve words on good docs unless they are mostly rejected*/
345 if (!tessedit_row_rej_good_docs && good_quality_doc) {
346 rej_word = word->reject_map.reject_count() /
347 static_cast<float>(word->reject_map.length()) >
348 tessedit_good_doc_still_rowrej_wd;
349 } else if (tessedit_preserve_row_rej_perfect_wds) {
350 /* Preserve perfect words anyway */
351 rej_word = word->reject_map.reject_count() > 0 ||
352 word->reject_map.length () < tessedit_preserve_min_wd_len;
353 if (rej_word && tessedit_dont_rowrej_good_wds &&
354 word->reject_map.length() >= tessedit_preserve_min_wd_len &&
355 acceptable_word_string(*word->uch_set,
356 word->best_choice->unichar_string().string(),
357 word->best_choice->unichar_lengths().string()) !=
359 word_char_quality(word, page_res_it.row()->row,
361 &accepted_char_quality);
362 rej_word = char_quality != word->reject_map.length();
369 Reject spacing if both current and prev words are rejected.
370 NOTE - this is NOT restricted to FUZZY spaces. - When tried
371 this generated more space errors.
373 if (tessedit_use_reject_spaces &&
374 prev_word_rejected &&
375 page_res_it.prev_row() == page_res_it.row() &&
376 word->word->space () == 1)
377 word->reject_spaces = true;
378 word->reject_map.rej_word_row_rej();
380 prev_word_rejected = rej_word;
381 page_res_it.forward();
384 if (tessedit_debug_block_rejection) {
385 tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
386 row_no, current_row->char_count, current_row->rej_count);
388 while (page_res_it.word() != nullptr &&
389 page_res_it.row() == current_row)
390 page_res_it.forward();
398 } // namespace tesseract
406 void reject_whole_page(PAGE_RES_IT &page_res_it) {
407 page_res_it.restart_page ();
408 while (page_res_it.word () != nullptr) {
409 page_res_it.word ()->reject_map.rej_word_doc_rej ();
410 page_res_it.forward ();
412 //whole page is rejected
413 page_res_it.page_res->rejected = true;
◆ dont_allow_1Il()
void tesseract::Tesseract::dont_allow_1Il |
( |
WERD_RES * |
word | ) |
|
Definition at line 532 of file reject.cpp.
550 for (i = 0, offset = 0; i < word_len;
◆ dump_words()
void tesseract::Tesseract::dump_words |
( |
WERD_RES_LIST & |
perm, |
|
|
int16_t |
score, |
|
|
int16_t |
mode, |
|
|
bool |
improved |
|
) |
| |
Definition at line 476 of file fixspace.cpp.
478 WERD_RES_IT word_res_it(&perm);
483 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
484 word_res_it.forward()) {
485 if (!word_res_it.data()->part_of_combo) {
487 word_res_it.data()->best_choice->unichar_string();
496 tprintf(
"EXTRACTED (%d): \"", score);
499 tprintf(
"TESTED (%d): \"", score);
502 tprintf(
"RETURNED (%d): \"", score);
506 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
507 word_res_it.forward()) {
508 if (!word_res_it.data()->part_of_combo) {
510 word_res_it.data()->best_choice->unichar_string().string(),
511 static_cast<int>(word_res_it.data()->best_choice->permuter()));
515 }
else if (improved) {
517 for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
518 word_res_it.forward()) {
519 if (!word_res_it.data()->part_of_combo) {
521 word_res_it.data()->best_choice->unichar_string().string(),
522 static_cast<int>(word_res_it.data()->best_choice->permuter()));
◆ end_tesseract()
void tesseract::Tesseract::end_tesseract |
( |
| ) |
|
◆ eval_word_spacing()
int16_t tesseract::Tesseract::eval_word_spacing |
( |
WERD_RES_LIST & |
word_res_list | ) |
|
Definition at line 266 of file fixspace.cpp.
267 WERD_RES_IT word_res_it(&word_res_list);
268 int16_t total_score = 0;
269 int16_t word_count = 0;
270 int16_t done_word_count = 0;
275 int16_t prev_word_score = 0;
276 bool prev_word_done =
false;
277 bool prev_char_1 =
false;
278 bool prev_char_digit =
false;
279 bool current_char_1 =
false;
280 bool current_word_ok_so_far;
281 STRING punct_chars =
"!\"`',.:;";
282 bool prev_char_punct =
false;
283 bool current_char_punct =
false;
284 bool word_done =
false;
287 word = word_res_it.data();
291 total_score += prev_word_score;
296 prev_char_digit =
false;
297 prev_word_done =
false;
305 current_word_ok_so_far =
false;
307 (prev_char_digit && (
313 total_score += prev_word_score;
316 current_word_ok_so_far = word_done;
319 if (current_word_ok_so_far) {
320 prev_word_done =
true;
321 prev_word_score = word_len;
323 prev_word_done =
false;
329 for (i = 0, prev_char_1 =
false; i < word_len; i++) {
331 if (prev_char_1 || (current_char_1 && (i > 0)))
333 prev_char_1 = current_char_1;
339 for (i = 0, offset = 0, prev_char_punct =
false; i < word_len;
343 if (prev_char_punct || (current_char_punct && i > 0))
345 prev_char_punct = current_char_punct;
349 for (i = 0, offset = 0; i < word_len - 1;
358 word_res_it.forward();
359 }
while (word_res_it.data()->part_of_combo);
360 }
while (!word_res_it.at_first());
361 total_score += prev_word_score;
364 if (done_word_count == word_count)
◆ failure_count()
int16_t tesseract::Tesseract::failure_count |
( |
WERD_RES * |
word | ) |
|
Definition at line 968 of file docqual.cpp.
972 for (; *str !=
'\0'; str++) {
◆ FindSegmentation()
◆ first_alphanum_index()
int16_t tesseract::Tesseract::first_alphanum_index |
( |
const char * |
word, |
|
|
const char * |
word_lengths |
|
) |
| |
Definition at line 475 of file reject.cpp.
484 const char *word_lengths) {
◆ first_alphanum_offset()
int16_t tesseract::Tesseract::first_alphanum_offset |
( |
const char * |
word, |
|
|
const char * |
word_lengths |
|
) |
| |
Definition at line 488 of file reject.cpp.
497 const char *word_lengths) {
◆ fix_fuzzy_space_list()
void tesseract::Tesseract::fix_fuzzy_space_list |
( |
WERD_RES_LIST & |
best_perm, |
|
|
ROW * |
row, |
|
|
BLOCK * |
block |
|
) |
| |
Definition at line 172 of file fixspace.cpp.
176 WERD_RES_LIST current_perm;
177 int16_t current_score;
178 bool improved =
false;
181 dump_words(best_perm, best_score, 1, improved);
186 while ((best_score !=
PERFECT_WERDS) && !current_perm.empty()) {
189 dump_words(current_perm, current_score, 2, improved);
190 if (current_score > best_score) {
193 best_score = current_score;
199 dump_words(best_perm, best_score, 3, improved);
◆ fix_fuzzy_spaces()
void tesseract::Tesseract::fix_fuzzy_spaces |
( |
ETEXT_DESC * |
monitor, |
|
|
int32_t |
word_count, |
|
|
PAGE_RES * |
page_res |
|
) |
| |
Definition at line 75 of file fixspace.cpp.
78 BLOCK_RES_IT block_res_it;
79 ROW_RES_IT row_res_it;
80 WERD_RES_IT word_res_it_from;
81 WERD_RES_IT word_res_it_to;
83 WERD_RES_LIST fuzzy_space_words;
85 bool prevent_null_wd_fixsp;
90 for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
91 block_res_it.forward()) {
92 row_res_it.set_to_list(&block_res_it.data()->row_res_list);
93 for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
94 row_res_it.forward()) {
95 word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
96 while (!word_res_it_from.at_last()) {
97 word_res = word_res_it_from.data();
98 while (!word_res_it_from.at_last() &&
100 word_res_it_from.data_relative(1)->word->flag(
W_FUZZY_NON) ||
101 word_res_it_from.data_relative(1)->word->flag(
W_FUZZY_SP))) {
103 block_res_it.data()->block);
104 word_res = word_res_it_from.forward();
106 if (monitor !=
nullptr) {
108 monitor->
progress = 90 + 5 * word_index / word_count;
110 (monitor->
cancel !=
nullptr &&
116 if (!word_res_it_from.at_last()) {
117 word_res_it_to = word_res_it_from;
118 prevent_null_wd_fixsp =
122 word_res_it_to.forward();
124 if (monitor !=
nullptr) {
126 monitor->
progress = 90 + 5 * word_index / word_count;
128 (monitor->
cancel !=
nullptr &&
132 while (!word_res_it_to.at_last () &&
133 (word_res_it_to.data_relative(1)->word->flag(
W_FUZZY_NON) ||
134 word_res_it_to.data_relative(1)->word->flag(
W_FUZZY_SP))) {
138 prevent_null_wd_fixsp =
true;
139 word_res = word_res_it_to.forward();
144 prevent_null_wd_fixsp =
true;
145 if (prevent_null_wd_fixsp) {
146 word_res_it_from = word_res_it_to;
148 fuzzy_space_words.assign_to_sublist(&word_res_it_from,
151 row_res_it.data()->row,
152 block_res_it.data()->block);
153 new_length = fuzzy_space_words.length();
154 word_res_it_from.add_list_before(&fuzzy_space_words);
156 !word_res_it_from.at_last() && new_length > 0;
158 word_res_it_from.forward();
165 block_res_it.data()->block);
◆ fix_noisy_space_list()
void tesseract::Tesseract::fix_noisy_space_list |
( |
WERD_RES_LIST & |
best_perm, |
|
|
ROW * |
row, |
|
|
BLOCK * |
block |
|
) |
| |
Definition at line 596 of file fixspace.cpp.
599 WERD_RES_IT best_perm_it(&best_perm);
600 WERD_RES_LIST current_perm;
601 WERD_RES_IT current_perm_it(&current_perm);
603 int16_t current_score;
604 bool improved =
false;
608 dump_words(best_perm, best_score, 1, improved);
610 old_word_res = best_perm_it.data();
619 while (best_score !=
PERFECT_WERDS && !current_perm.empty()) {
622 dump_words(current_perm, current_score, 2, improved);
623 if (current_score > best_score) {
626 best_score = current_score;
633 dump_words(best_perm, best_score, 3, improved);
◆ fix_rep_char()
void tesseract::Tesseract::fix_rep_char |
( |
PAGE_RES_IT * |
page_res_it | ) |
|
fix_rep_char(): The word is a repeated character (a leader). Find the repeated character, create the appropriate single-word or multi-word sequence according to the size of the spaces between blobs, and correct the classifications where some of the characters disagree with the majority.
Definition at line 1706 of file control.cpp.
1712 for (
int i = 0; i < word.
length(); ++i) {
1718 int max_count = rep_ch.MaxCount(&maxch_id);
1720 BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1721 if (best_choice ==
nullptr) {
1722 tprintf(
"Failed to find a choice for %s, occurring %d times\n",
1726 word_res->
done =
true;
1732 C_BLOB* prev_blob = blob_it.data();
1733 for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1734 C_BLOB* blob = blob_it.data();
1741 CorrectRepcharChoices(best_choice, word_res);
◆ fix_sp_fp_word()
void tesseract::Tesseract::fix_sp_fp_word |
( |
WERD_RES_IT & |
word_res_it, |
|
|
ROW * |
row, |
|
|
BLOCK * |
block |
|
) |
| |
Definition at line 562 of file fixspace.cpp.
565 WERD_RES_LIST sub_word_list;
566 WERD_RES_IT sub_word_list_it(&sub_word_list);
571 word_res = word_res_it.data();
583 tprintf(
"FP fixspace working on \"%s\"\n",
587 sub_word_list_it.add_after_stay_put(word_res_it.extract());
589 new_length = sub_word_list.length();
590 word_res_it.add_list_before(&sub_word_list);
591 for (; !word_res_it.at_last() && new_length > 1; new_length--) {
592 word_res_it.forward();
◆ fixspace_thinks_word_done()
bool tesseract::Tesseract::fixspace_thinks_word_done |
( |
WERD_RES * |
word | ) |
|
◆ flip_0O()
void tesseract::Tesseract::flip_0O |
( |
WERD_RES * |
word | ) |
|
Definition at line 679 of file reject.cpp.
684 TBLOB* blob = word_res->rebuild_word->blobs[i];
685 if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
686 word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
693 UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id(
"0");
694 UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id(
"O");
695 if (unichar_0 == INVALID_UNICHAR_ID ||
696 !word_res->uch_set->get_enabled(unichar_0) ||
697 unichar_O == INVALID_UNICHAR_ID ||
698 !word_res->uch_set->get_enabled(unichar_O)) {
701 for (i = 1; i < best_choice->length(); ++i) {
702 if (best_choice->unichar_id(i) == unichar_0 ||
703 best_choice->unichar_id(i) == unichar_O) {
705 if ((i+1) < best_choice->length() &&
706 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
707 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
708 best_choice->set_unichar_id(unichar_O, i);
711 if (
non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
712 (i+1) < best_choice->length() &&
713 (best_choice->unichar_id(i+1) == unichar_0 ||
714 best_choice->unichar_id(i+1) == unichar_O) &&
715 (i+2) < best_choice->length() &&
716 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
717 best_choice->set_unichar_id(unichar_O, i);
722 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
723 non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
724 (((i+1) < best_choice->length() &&
725 !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
726 !word_res->uch_set->eq(best_choice->unichar_id(i+1),
"l") &&
727 !word_res->uch_set->eq(best_choice->unichar_id(i+1),
"I")) ||
728 (i == best_choice->length() - 1))) {
729 best_choice->set_unichar_id(unichar_O, i);
732 if (
non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
733 (i+1) < best_choice->length() &&
734 non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
735 best_choice->set_unichar_id(unichar_0, i);
738 if (
non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
739 (i+2) < best_choice->length() &&
740 (best_choice->unichar_id(i+1) == unichar_0 ||
741 best_choice->unichar_id(i+1) == unichar_O) &&
742 (best_choice->unichar_id(i+2) == unichar_0 ||
743 best_choice->unichar_id(i+2) == unichar_O)) {
744 best_choice->set_unichar_id(unichar_0, i);
745 best_choice->set_unichar_id(unichar_0, i+1);
746 best_choice->set_unichar_id(unichar_0, i+2);
750 if (
non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
751 (i+2) < best_choice->length() &&
752 (best_choice->unichar_id(i+1) == unichar_0 ||
753 best_choice->unichar_id(i+1) == unichar_O) &&
754 !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
755 best_choice->set_unichar_id(unichar_0, i);
756 best_choice->set_unichar_id(unichar_0, i+1);
760 if (
non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
761 (i+1) < best_choice->length() &&
762 !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
763 best_choice->set_unichar_id(unichar_0, i);
767 (word_res->uch_set->eq(best_choice->unichar_id(i-1),
".") ||
768 word_res->uch_set->eq(best_choice->unichar_id(i-1),
",")) &&
769 (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
770 best_choice->unichar_id(i-2) == unichar_O)) {
771 if (best_choice->unichar_id(i-2) == unichar_O) {
772 best_choice->set_unichar_id(unichar_0, i-2);
774 while (i < best_choice->length() &&
775 (best_choice->unichar_id(i) == unichar_O ||
776 best_choice->unichar_id(i) == unichar_0)) {
777 best_choice->set_unichar_id(unichar_0, i);
787 return ch_set.
get_isupper(unichar_id) && !ch_set.
eq(unichar_id,
"O");
◆ flip_hyphens()
void tesseract::Tesseract::flip_hyphens |
( |
WERD_RES * |
word | ) |
|
Definition at line 622 of file reject.cpp.
631 TBLOB* blob = word_res->rebuild_word->blobs[i];
633 if (i + 1 == num_blobs)
636 next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().
left();
638 if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
639 (out_box.left() > prev_right) && (out_box.right() < next_left)) {
640 aspect_ratio = out_box.width() /
static_cast<float>(out_box.height());
641 if (word_res->uch_set->eq(best_choice->unichar_id(i),
".")) {
643 word_res->uch_set->contains_unichar_id(unichar_dash) &&
644 word_res->uch_set->get_enabled(unichar_dash)) {
646 best_choice->set_unichar_id(unichar_dash, i);
647 if (word_res->reject_map[i].rejected())
648 word_res->reject_map[i].setrej_hyphen_accept();
651 word_res->reject_map[i].accepted())
653 word_res->reject_map[i].setrej_hyphen ();
655 else if (best_choice->unichar_id(i) == unichar_dash) {
657 (word_res->reject_map[i].rejected()))
658 word_res->reject_map[i].setrej_hyphen_accept();
662 (word_res->reject_map[i].accepted()))
664 word_res->reject_map[i].setrej_hyphen();
667 prev_right = out_box.right();
◆ font_recognition_pass()
void tesseract::Tesseract::font_recognition_pass |
( |
PAGE_RES * |
page_res | ) |
|
font_recognition_pass
Smooth the fonts for the document.
Definition at line 2037 of file control.cpp.
2040 STATS doc_fonts(0, font_table_size_);
2043 for (page_res_it.restart_page(); page_res_it.word() !=
nullptr;
2044 page_res_it.forward()) {
2045 word = page_res_it.
word();
2054 int8_t doc_font_count;
2055 find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2056 if (doc_font_count == 0)
2059 const FontInfo* modal_font =
nullptr;
2060 for (page_res_it.restart_page(); page_res_it.word() !=
nullptr;
2061 page_res_it.forward()) {
2062 word = page_res_it.
word();
2075 for (page_res_it.restart_page(); page_res_it.word() !=
nullptr;
2076 page_res_it.forward()) {
2077 word = page_res_it.
word();
2081 if (!(
count == length || (length > 3 &&
count >= length * 3 / 4))) {
◆ fp_eval_word_spacing()
int16_t tesseract::Tesseract::fp_eval_word_spacing |
( |
WERD_RES_LIST & |
word_res_list | ) |
|
Definition at line 857 of file fixspace.cpp.
858 WERD_RES_IT word_it(&word_res_list);
864 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
865 word = word_it.data();
◆ garbage_word()
Definition at line 679 of file docqual.cpp.
694 int isolated_digits = 0;
695 int isolated_alphas = 0;
696 int bad_char_count = 0;
701 int alpha_repetition_count = 0;
702 int longest_alpha_repetition_count = 0;
703 int longest_lower_run_len = 0;
704 int lower_string_count = 0;
705 int longest_upper_run_len = 0;
706 int upper_string_count = 0;
707 int total_alpha_count = 0;
708 int total_digit_count = 0;
710 for (; *str !=
'\0'; str += *(lengths++)) {
715 case SUBSEQUENT_UPPER:
717 state = SUBSEQUENT_UPPER;
718 upper_string_count++;
719 if (longest_upper_run_len < upper_string_count)
720 longest_upper_run_len = upper_string_count;
722 alpha_repetition_count++;
723 if (longest_alpha_repetition_count < alpha_repetition_count) {
724 longest_alpha_repetition_count = alpha_repetition_count;
729 alpha_repetition_count = 1;
738 alpha_repetition_count = 1;
739 upper_string_count = 1;
746 case SUBSEQUENT_LOWER:
748 state = SUBSEQUENT_LOWER;
749 lower_string_count++;
750 if (longest_lower_run_len < lower_string_count)
751 longest_lower_run_len = lower_string_count;
753 alpha_repetition_count++;
754 if (longest_alpha_repetition_count < alpha_repetition_count) {
755 longest_alpha_repetition_count = alpha_repetition_count;
760 alpha_repetition_count = 1;
769 alpha_repetition_count = 1;
770 lower_string_count = 1;
778 state = SUBSEQUENT_NUM;
791 if (*lengths == 1 && *str ==
' ')
821 total_alpha_count += total_digit_count - isolated_digits;
825 2 * (total_alpha_count - isolated_alphas) > len &&
835 strpbrk(str,
" ") ==
nullptr &&
844 ok_chars = len - bad_char_count - isolated_digits -
845 isolated_alphas - tess_rejs;
848 tprintf(
"garbage_word: \"%s\"\n",
850 tprintf(
"LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
852 bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
854 if (bad_char_count == 0 &&
856 (len > isolated_digits + isolated_alphas || len <= 2))
859 if (tess_rejs > ok_chars ||
860 (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
864 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
866 if (dodgy_chars > 5 || (dodgy_chars /
static_cast<float>(len)) > 0.5)
871 dodgy_chars = 2 * tess_rejs + bad_char_count;
872 if ((len == 4 && dodgy_chars > 2) ||
873 (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
◆ get_rep_char()
◆ get_sub_lang()
Tesseract* tesseract::Tesseract::get_sub_lang |
( |
int |
index | ) |
const |
|
inline |
◆ getDict()
Dict & tesseract::Tesseract::getDict |
( |
| ) |
|
|
overridevirtual |
◆ GetLineData()
Definition at line 136 of file linerec.cpp.
144 if (image_data ==
nullptr)
return nullptr;
151 for (
int b = start_box; b < end_box; ++b) {
153 box.
rotate(block_rotation);
160 image_data->AddBoxes(line_boxes, line_texts, page_numbers);
◆ GetRectImage()
ImageData * tesseract::Tesseract::GetRectImage |
( |
const TBOX & |
box, |
|
|
const BLOCK & |
block, |
|
|
int |
padding, |
|
|
TBOX * |
revised_box |
|
) |
| const |
Definition at line 170 of file linerec.cpp.
173 wbox.
pad(padding, padding);
177 int num_rotations = 0;
191 int width = pixGetWidth(pix);
192 int height = pixGetHeight(pix);
193 TBOX image_box(0, 0, width, height);
195 *revised_box &= image_box;
196 if (revised_box->
null_box())
return nullptr;
197 Box* clip_box = boxCreate(revised_box->
left(), height - revised_box->
top(),
199 Pix* box_pix = pixClipRectangle(pix, clip_box,
nullptr);
200 if (box_pix ==
nullptr)
return nullptr;
201 boxDestroy(&clip_box);
202 if (num_rotations > 0) {
203 Pix* rot_pix = pixRotateOrth(box_pix, num_rotations);
204 pixDestroy(&box_pix);
208 int depth = pixGetDepth(box_pix);
211 grey = pixConvertTo8(box_pix,
false);
212 pixDestroy(&box_pix);
215 bool vertical_text =
false;
216 if (num_rotations > 0) {
219 revised_box->
rotate(rotation);
220 if (num_rotations != 2)
221 vertical_text =
true;
223 return new ImageData(vertical_text, box_pix);
◆ GetSubAndSuperscriptCandidates()
void tesseract::Tesseract::GetSubAndSuperscriptCandidates |
( |
const WERD_RES * |
word, |
|
|
int * |
num_rebuilt_leading, |
|
|
ScriptPos * |
leading_pos, |
|
|
float * |
leading_certainty, |
|
|
int * |
num_rebuilt_trailing, |
|
|
ScriptPos * |
trailing_pos, |
|
|
float * |
trailing_certainty, |
|
|
float * |
avg_certainty, |
|
|
float * |
unlikely_threshold |
|
) |
| |
Determine how many characters (rebuilt blobs) on each end of a given word might plausibly be superscripts so SubAndSuperscriptFix can try to re-recognize them. Even if we find no whole blobs at either end, we will set *unlikely_threshold to a certainty that might be used to select "bad enough" outlier characters. If *unlikely_threshold is set to 0, though, there's really no hope.
- Parameters
-
[in] | word | The word to examine. |
[out] | num_rebuilt_leading | the number of rebuilt blobs at the start of the word which are all up or down and seem badly classified. |
[out] | leading_pos | "super" or "sub" (for debugging) |
[out] | leading_certainty | the worst certainty in the leading blobs. |
[out] | num_rebuilt_trailing | the number of rebuilt blobs at the end of the word which are all up or down and seem badly classified. |
[out] | trailing_pos | "super" or "sub" (for debugging) |
[out] | trailing_certainty | the worst certainty in the trailing blobs. |
[out] | avg_certainty | the average certainty of "normal" blobs in the word. |
[out] | unlikely_threshold | the threshold (on certainty) we used to select "bad enough" outlier characters. |
Definition at line 254 of file superscript.cpp.
263 *avg_certainty = *unlikely_threshold = 0.0f;
264 *num_rebuilt_leading = *num_rebuilt_trailing = 0;
265 *leading_certainty = *trailing_certainty = 0.0f;
275 *leading_pos = *trailing_pos =
SP_NORMAL;
276 int leading_outliers = 0;
277 int trailing_outliers = 0;
279 float normal_certainty_total = 0.0f;
280 float worst_normal_certainty = 0.0f;
283 for (
int b = 0; b < num_blobs; ++b) {
286 if (box.
bottom() >= super_y_bottom) {
288 }
else if (box.
top() <= sub_y_top) {
294 if (char_certainty < worst_normal_certainty) {
295 worst_normal_certainty = char_certainty;
298 normal_certainty_total += char_certainty;
300 if (trailing_outliers == b) {
301 leading_outliers = trailing_outliers;
302 *leading_pos = last_pos;
304 trailing_outliers = 0;
306 if (last_pos == pos) {
309 trailing_outliers = 1;
314 *trailing_pos = last_pos;
315 if (num_normal >= 3) {
317 normal_certainty_total -= worst_normal_certainty;
319 if (num_normal > 0) {
320 *avg_certainty = normal_certainty_total / num_normal;
323 if (num_normal == 0 ||
324 (leading_outliers == 0 && trailing_outliers == 0)) {
331 for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0;
332 *num_rebuilt_leading < leading_outliers;
333 (*num_rebuilt_leading)++) {
335 if (char_certainty > *unlikely_threshold) {
338 if (char_certainty < *leading_certainty) {
339 *leading_certainty = char_certainty;
344 for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
345 *num_rebuilt_trailing < trailing_outliers;
346 (*num_rebuilt_trailing)++) {
347 int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
349 if (char_certainty > *unlikely_threshold) {
352 if (char_certainty < *trailing_certainty) {
353 *trailing_certainty = char_certainty;
◆ ImageHeight()
int tesseract::Tesseract::ImageHeight |
( |
| ) |
const |
|
inline |
◆ ImageWidth()
int tesseract::Tesseract::ImageWidth |
( |
| ) |
const |
|
inline |
◆ init_recog_training()
FILE * tesseract::Tesseract::init_recog_training |
( |
const STRING & |
fname | ) |
|
Definition at line 36 of file recogtraining.cpp.
44 STRING output_fname = fname;
45 const char* lastdot = strrchr(output_fname.
string(),
'.');
46 if (lastdot !=
nullptr)
47 output_fname[lastdot - output_fname.
string()] =
'\0';
48 output_fname +=
".txt";
49 FILE* output_file = fopen(output_fname.
string(),
"a+");
50 if (output_file ==
nullptr) {
51 tprintf(
"Error: Could not open file %s\n", output_fname.
string());
◆ init_tesseract() [1/2]
Definition at line 304 of file tessedit.cpp.
305 const char* lang_str = langs_to_load[lang_index].string();
307 if (!loaded_primary) {
313 int result = tess_to_init->init_tesseract_internal(
314 arg0, textbase, lang_str, oem, configs, configs_size, vars_vec,
315 vars_values, set_only_non_debug_params, mgr);
319 if (!loaded_primary) {
321 tprintf(
"Failed loading language '%s'\n", lang_str);
324 &langs_to_load, &langs_not_to_load);
325 loaded_primary =
true;
329 tprintf(
"Failed loading language '%s'\n", lang_str);
332 sub_langs_.push_back(tess_to_init);
335 &langs_to_load, &langs_not_to_load);
340 if (!loaded_primary) {
341 tprintf(
"Tesseract couldn't load any languages!\n");
344 #ifndef DISABLED_LEGACY_ENGINE
345 if (!sub_langs_.empty()) {
352 for (
int s = 0; s < sub_langs_.size(); ++s) {
353 sub_langs_[s]->language_model_->getParamsModel().Copy(
356 tprintf(
"Using params model of the primary language\n");
359 for (
int s = 0; s < sub_langs_.size(); ++s) {
360 sub_langs_[s]->language_model_->getParamsModel().Clear();
366 #endif // ndef DISABLED_LEGACY_ENGINE
◆ init_tesseract() [2/2]
int tesseract::Tesseract::init_tesseract |
( |
const char * |
datapath, |
|
|
const char * |
language, |
|
|
OcrEngineMode |
oem |
|
) |
| |
|
inline |
◆ init_tesseract_internal()
Definition at line 404 of file tessedit.cpp.
409 #ifndef DISABLED_LEGACY_ENGINE
414 for (
int i = 0; i < new_fonts.
size(); ++i) {
423 for (
int i = 0; i < lang_fonts->
size(); ++i) {
424 int index = all_fonts.
get_id(lang_fonts->
get(i));
◆ init_tesseract_lang_data()
Definition at line 97 of file tessedit.cpp.
107 if (!mgr->IsLSTMAvailable()) {
109 }
else if (!mgr->IsBaseAvailable()) {
115 #endif // ndef DISABLED_LEGACY_ENGINE
130 for (
int i = 0; i < configs_size; ++i) {
136 if (vars_vec !=
nullptr && vars_values !=
nullptr) {
137 for (
int i = 0; i < vars_vec->
size(); ++i) {
139 (*vars_values)[i].
string(),
140 set_params_constraint, this->
params())) {
141 tprintf(
"Warning: The parameter '%s' was not found.\n", (*vars_vec)[i].
string());
148 if (params_file !=
nullptr) {
152 tprintf(
"Failed to open %s for writing params.\n",
169 #ifndef ANDROID_BUILD
170 # ifdef DISABLED_LEGACY_ENGINE
175 # endif // ndef DISABLED_LEGACY_ENGINE
181 tprintf(
"Error: LSTM requested, but not present!! Loading tesseract.\n");
185 #endif // ndef ANDROID_BUILD
190 #ifndef ANDROID_BUILD
192 #endif // ndef ANDROID_BUILD
194 #ifndef DISABLED_LEGACY_ENGINE
197 tprintf(
"Error: Tesseract (legacy) engine requested, but components are "
198 "not present in %s!!\n", tessdata_path.c_str());
201 #endif // ndef DISABLED_LEGACY_ENGINE
203 tprintf(
"Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
208 #ifndef DISABLED_LEGACY_ENGINE
235 #endif // ndef DISABLED_LEGACY_ENGINE
241 static bool IsStrInList(
const STRING& str,
243 for (
int i = 0; i < str_list.
size(); ++i) {
244 if (str_list[i] == str)
return true;
◆ init_tesseract_lm()
int tesseract::Tesseract::init_tesseract_lm |
( |
const char * |
arg0, |
|
|
const char * |
textbase, |
|
|
const char * |
language, |
|
|
TessdataManager * |
mgr |
|
) |
| |
◆ join_words()
Definition at line 239 of file tfacepp.cpp.
284 if (total_joined_choices >= kTooManyAltChoices &&
285 bc2_index > kAltsPerPiece)
288 for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
289 ++bc1_index, bc1_it.forward()) {
290 if (total_joined_choices >= kTooManyAltChoices &&
291 bc1_index > kAltsPerPiece)
294 *wc += *bc2_it.data();
295 jc_it.add_after_then_move(wc);
296 ++total_joined_choices;
301 bc1_it.move_to_first();
302 bc2_it.move_to_first();
303 for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
304 *bc1_it.data() += *bc2_it.data();
306 bc1_it.move_to_last();
307 bc1_it.add_list_after(&joined_choices);
311 if (orig_bb !=
nullptr) {
◆ LSTMRecognizeWord()
Definition at line 229 of file linerec.cpp.
246 if (im_data ==
nullptr)
return;
◆ make_reject_map()
void tesseract::Tesseract::make_reject_map |
( |
WERD_RES * |
word, |
|
|
ROW * |
row, |
|
|
int16_t |
pass |
|
) |
| |
◆ match_current_words()
void tesseract::Tesseract::match_current_words |
( |
WERD_RES_LIST & |
words, |
|
|
ROW * |
row, |
|
|
BLOCK * |
block |
|
) |
| |
Definition at line 223 of file fixspace.cpp.
225 WERD_RES_IT word_it(&words);
230 for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
231 word = word_it.data();
233 WordData word_data(block, row, word);
◆ match_word_pass_n()
void tesseract::Tesseract::match_word_pass_n |
( |
int |
pass_n, |
|
|
WERD_RES * |
word, |
|
|
ROW * |
row, |
|
|
BLOCK * |
block |
|
) |
| |
match_word_pass_n
Baseline normalize the word and pass it to Tess.
Definition at line 1630 of file control.cpp.
1642 tprintf(
"POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
◆ MaximallyChopWord()
Tests the chopper by exhaustively running chop_one_blob. The word_res will contain filled chopped_word, seam_array, denorm, box_word and best_state for the maximally chopped word.
Definition at line 243 of file applybox.cpp.
256 tprintf(
"Maximally chopping word at:");
261 auto rating =
static_cast<float>(INT8_MAX);
276 const double e = exp(1.0);
278 int right_chop_index = 0;
281 SEAM* seam =
nullptr;
283 &blob_number)) !=
nullptr) {
285 BLOB_CHOICE* left_choice = blob_choices[blob_number];
286 rating = left_choice->
rating() / e;
290 auto* right_choice =
new BLOB_CHOICE(++right_chop_index,
291 rating - 0.125f, -rating, -1,
293 blob_choices.
insert(right_choice, blob_number + 1);
◆ mutable_pix_binary()
Pix** tesseract::Tesseract::mutable_pix_binary |
( |
| ) |
|
|
inline |
◆ mutable_textord()
Textord* tesseract::Tesseract::mutable_textord |
( |
| ) |
|
|
inline |
◆ nn_match_word()
void tesseract::Tesseract::nn_match_word |
( |
WERD_RES * |
word, |
|
|
ROW * |
row |
|
) |
| |
◆ nn_recover_rejects()
void tesseract::Tesseract::nn_recover_rejects |
( |
WERD_RES * |
word, |
|
|
ROW * |
row |
|
) |
| |
◆ noise_outlines()
bool tesseract::Tesseract::noise_outlines |
( |
TWERD * |
word | ) |
|
Definition at line 980 of file docqual.cpp.
982 int16_t outline_count = 0;
983 int16_t small_outline_count = 0;
984 int16_t max_dimension;
987 for (
int b = 0; b < word->
NumBlobs(); ++b) {
991 box = ol->bounding_box();
993 max_dimension = box.
height();
995 max_dimension = box.
width();
996 if (max_dimension < small_limit)
997 small_outline_count++;
1000 return small_outline_count >= outline_count;
◆ non_0_digit()
◆ non_O_upper()
◆ num_sub_langs()
int tesseract::Tesseract::num_sub_langs |
( |
| ) |
const |
|
inline |
◆ one_ell_conflict()
bool tesseract::Tesseract::one_ell_conflict |
( |
WERD_RES * |
word_res, |
|
|
bool |
update_map |
|
) |
| |
Definition at line 298 of file reject.cpp.
348 dict_word_ok = (dict_word_type > 0) &&
353 (dict_perm_type && dict_word_ok)) {
356 if (lengths[first_alphanum_index_] == 1 &&
357 word[first_alphanum_offset_] ==
'I') {
363 setrej_1Il_conflict();
372 if (lengths[first_alphanum_index_] == 1 &&
373 word[first_alphanum_offset_] ==
'l') {
379 setrej_1Il_conflict();
403 if (lengths[first_alphanum_index_] == 1 &&
404 word[first_alphanum_offset_] ==
'l') {
411 else if (lengths[first_alphanum_index_] == 1 &&
412 word[first_alphanum_offset_] ==
'I') {
431 for (i = 0, offset = 0; word[offset] !=
'\0';
433 if ((!allow_1s || (word[offset] !=
'1')) &&
436 word_res->
reject_map[i].setrej_1Il_conflict ();
453 setrej_1Il_conflict ();
471 const char *word_lengths) {
◆ output_pass()
void tesseract::Tesseract::output_pass |
( |
PAGE_RES_IT & |
page_res_it, |
|
|
const TBOX * |
target_word_box |
|
) |
| |
Definition at line 36 of file output.cpp.
45 block_of_last_word =
nullptr;
46 while (page_res_it.
word () !=
nullptr) {
49 if (target_word_box) {
52 (current_word_box.
right() + current_word_box.
left()) / 2,
53 (current_word_box.
bottom() + current_word_box.
top()) / 2);
54 if (!target_word_box->
contains(center_pt)) {
60 block_of_last_word != page_res_it.
block ()) {
61 block_of_last_word = page_res_it.
block ();
80 nextword, nextblock), force_eol);
◆ ParseLanguageString()
Definition at line 272 of file tessedit.cpp.
277 target->push_back(lang_code);
288 char** configs,
int configs_size,
291 bool set_only_non_debug_params,
292 TessdataManager* mgr) {
297 sub_langs_.delete_data_pointers();
◆ pgeditor_main()
void tesseract::Tesseract::pgeditor_main |
( |
int |
width, |
|
|
int |
height, |
|
|
PAGE_RES * |
page_res |
|
) |
| |
pgeditor_main()
Top-level editor operation: set up a new window and a corresponding event handler.
Definition at line 378 of file pgedit.cpp.
379 current_page_res = page_res;
386 build_image_window(width, height);
389 #ifndef GRAPHICS_DISABLED
◆ pix_binary()
Pix* tesseract::Tesseract::pix_binary |
( |
| ) |
const |
|
inline |
◆ pix_grey()
Pix* tesseract::Tesseract::pix_grey |
( |
| ) |
const |
|
inline |
◆ pix_original()
Pix* tesseract::Tesseract::pix_original |
( |
| ) |
const |
|
inline |
◆ potential_word_crunch()
bool tesseract::Tesseract::potential_word_crunch |
( |
WERD_RES * |
word, |
|
|
GARBAGE_LEVEL |
garbage_level, |
|
|
bool |
ok_dict_word |
|
) |
| |
Definition at line 541 of file docqual.cpp.
548 bool word_crunchable;
549 int poor_indicator_count = 0;
558 if (adjusted_len > 10)
564 tprintf(
"Potential poor rating on \"%s\"\n",
567 poor_indicator_count++;
570 if (word_crunchable &&
573 tprintf(
"Potential poor cert on \"%s\"\n",
576 poor_indicator_count++;
579 if (garbage_level !=
G_OK) {
581 tprintf(
"Potential garbage on \"%s\"\n",
584 poor_indicator_count++;
◆ PreenXHeights()
void tesseract::Tesseract::PreenXHeights |
( |
BLOCK_LIST * |
block_list | ) |
|
Any row xheight that is significantly different from the median is set to the median.
Definition at line 181 of file applybox.cpp.
182 const double median_xheight = MedianXHeight(block_list);
185 BLOCK_IT b_it(block_list);
186 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
187 BLOCK* block = b_it.data();
189 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
190 ROW* row = r_it.data();
191 const double diff = fabs(row->
x_height() - median_xheight);
192 if (diff > max_deviation) {
194 tprintf(
"row xheight=%g, but median xheight = %g\n",
◆ PrepareForPageseg()
void tesseract::Tesseract::PrepareForPageseg |
( |
| ) |
|
Definition at line 631 of file tesseractclass.cpp.
634 auto max_pageseg_strategy =
637 for (
int i = 0; i < sub_langs_.size(); ++i) {
638 auto pageseg_strategy =
640 static_cast<int32_t
>(sub_langs_[i]->pageseg_devanagari_split_strategy));
641 if (pageseg_strategy > max_pageseg_strategy)
642 max_pageseg_strategy = pageseg_strategy;
643 pixDestroy(&sub_langs_[i]->pix_binary_);
644 sub_langs_[i]->pix_binary_ = pixClone(
pix_binary());
650 if (splitter_.
Split(
true, &pixa_debug_)) {
652 pixDestroy(&pix_binary_);
◆ PrepareForTessOCR()
void tesseract::Tesseract::PrepareForTessOCR |
( |
BLOCK_LIST * |
block_list, |
|
|
Tesseract * |
osd_tess, |
|
|
OSResults * |
osr |
|
) |
| |
Definition at line 662 of file tesseractclass.cpp.
665 auto max_ocr_strategy =
668 for (
int i = 0; i < sub_langs_.size(); ++i) {
671 static_cast<int32_t
>(sub_langs_[i]->ocr_devanagari_split_strategy));
672 if (ocr_strategy > max_ocr_strategy)
673 max_ocr_strategy = ocr_strategy;
679 bool split_for_ocr = splitter_.
Split(
false, &pixa_debug_);
682 pixDestroy(&pix_binary_);
683 pix_binary_ = pixClone(splitter_.
orig_pix());
688 BLOCK block(
"",
true, 0, 0, 0, 0, pixGetWidth(pix_binary_),
689 pixGetHeight(pix_binary_));
◆ PrerecAllWordsPar()
Definition at line 38 of file par_control.cpp.
41 for (
int w = 0; w < words.
size(); ++w) {
42 if (words[w].word->ratings !=
nullptr &&
43 words[w].word->ratings->get(0, 0) ==
nullptr) {
44 for (
int s = 0; s < words[w].lang_words.
size(); ++s) {
45 Tesseract* sub = s < sub_langs_.size() ? sub_langs_[s] :
this;
46 const WERD_RES& word = *words[w].lang_words[s];
56 #pragma omp parallel for num_threads(10)
58 for (
int b = 0; b < blobs.
size(); ++b) {
60 blobs[b].tesseract->classify_blob(blobs[b].blob,
"par",
White,
nullptr);
64 for (
int b = 0; b < blobs.
size(); ++b) {
66 blobs[b].tesseract->classify_blob(blobs[b].blob,
"par",
White,
nullptr);
◆ process_cmd_win_event()
bool tesseract::Tesseract::process_cmd_win_event |
( |
int32_t |
cmd_event, |
|
|
char * |
new_value |
|
) |
| |
Definition at line 415 of file pgedit.cpp.
463 word_config_ = parameter;
467 if (new_value[0] ==
'T')
474 if (new_value[0] ==
'T')
482 if (new_value[0] ==
'T')
489 if (new_value[0] ==
'T')
496 if (new_value[0] ==
'T')
503 if (new_value[0] ==
'T')
513 display_image =(new_value[0] ==
'T');
517 display_blocks =(new_value[0] ==
'T');
521 display_baselines =(new_value[0] ==
'T');
569 snprintf(msg,
sizeof(msg),
"Unrecognised event %" PRId32
"(%s)",
570 cmd_event, new_value);
◆ process_image_event()
void tesseract::Tesseract::process_image_event |
( |
const SVEvent & |
event | ) |
|
process_image_event()
User has done something in the image window - mouse down or up. Work out what it is and do something with it. If DOWN - just remember where it was. If UP - for each word in the selected area do the operation defined by the current mode.
Definition at line 587 of file pgedit.cpp.
603 show_point(current_page_res, event.
x, event.
y);
609 selection_box =
TBOX(down, up);
635 #ifndef DISABLED_LEGACY_ENGINE
636 image_win->
AddMessage(
"Recogging selected words");
640 #endif // ndef DISABLED_LEGACY_ENGINE
643 image_win->
AddMessage(
"Recogging selected blobs");
651 sprintf(msg,
"Mode %d not yet implemented", mode);
◆ process_selected_words()
void tesseract::Tesseract::process_selected_words |
( |
PAGE_RES * |
page_res, |
|
|
TBOX & |
selection_box, |
|
|
bool(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it) |
word_processor |
|
) |
| |
◆ ProcessTargetWord()
bool tesseract::Tesseract::ProcessTargetWord |
( |
const TBOX & |
word_box, |
|
|
const TBOX & |
target_word_box, |
|
|
const char * |
word_config, |
|
|
int |
pass |
|
) |
| |
Definition at line 120 of file control.cpp.
124 if (word_config !=
nullptr) {
126 if (backup_config_file_ ==
nullptr) {
128 FILE* config_fp = fopen(backup_config_file_,
"wb");
129 if (config_fp ==
nullptr) {
130 tprintf(
"Error, failed to open file \"%s\"\n", backup_config_file_);
140 if (backup_config_file_ !=
nullptr) {
144 backup_config_file_ =
nullptr;
147 }
else if (pass > 1 && !word_box.
major_overlap(target_word_box)) {
◆ quality_based_rejection()
void tesseract::Tesseract::quality_based_rejection |
( |
PAGE_RES_IT & |
page_res_it, |
|
|
bool |
good_quality_doc |
|
) |
| |
◆ read_config_file()
void tesseract::Tesseract::read_config_file |
( |
const char * |
filename, |
|
|
SetParamConstraint |
constraint |
|
) |
| |
◆ ReassignDiacritics()
bool tesseract::Tesseract::ReassignDiacritics |
( |
int |
pass, |
|
|
PAGE_RES_IT * |
pr_it, |
|
|
bool * |
make_next_word_fuzzy |
|
) |
| |
Definition at line 945 of file control.cpp.
947 *make_next_word_fuzzy =
false;
961 &word_wanted, &overlapped_any_blob,
969 int num_overlapped = 0;
970 int num_overlapped_used = 0;
971 for (
int i = 0; i < overlapped_any_blob.
size(); ++i) {
972 if (overlapped_any_blob[i]) {
974 if (word_wanted[i]) ++num_overlapped_used;
978 outlines[i] =
nullptr;
984 int non_overlapped = 0;
985 int non_overlapped_used = 0;
986 for (
int i = 0; i < word_wanted.
size(); ++i) {
987 if (word_wanted[i]) ++non_overlapped_used;
988 if (outlines[i] !=
nullptr) ++non_overlapped_used;
991 tprintf(
"Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
992 num_overlapped_used, num_overlapped, non_overlapped_used,
998 make_next_word_fuzzy)) {
1003 return num_overlapped_used != 0 || non_overlapped_used != 0;
◆ recog_all_words()
bool tesseract::Tesseract::recog_all_words |
( |
PAGE_RES * |
page_res, |
|
|
ETEXT_DESC * |
monitor, |
|
|
const TBOX * |
target_word_box, |
|
|
const char * |
word_config, |
|
|
int |
dopasses |
|
) |
| |
recog_all_words()
Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. If dopasses is 0, all recognition passes are run; if 1, just pass 1; if 2, passes 2 and higher. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s); otherwise, on pass 2 and beyond ONLY the target words are processed (Jetsoft modification). Returns false if we cancelled prematurely.
- Parameters
-
page_res | page structure |
monitor | progress monitor |
word_config | word_config file |
target_word_box | specifies just to extract a rectangle |
dopasses | 0 - all, 1 just pass 1, 2 passes 2 and higher |
Definition at line 302 of file control.cpp.
314 if (dopasses==0 || dopasses==1) {
315 page_res_it.restart_page();
318 #ifndef DISABLED_LEGACY_ENGINE
329 for (
int i = 0; i < sub_langs_.size(); ++i) {
331 sub_langs_[i]->SwitchAdaptiveClassifier();
333 sub_langs_[i]->StartBackupAdaptiveClassifier();
337 #endif // ndef DISABLED_LEGACY_ENGINE
343 #ifndef DISABLED_LEGACY_ENGINE
347 #endif // ndef DISABLED_LEGACY_ENGINE
358 most_recently_used_ =
this;
362 for (page_res_it.restart_page(); page_res_it.word() !=
nullptr;
363 page_res_it.forward()) {
364 if (page_res_it.word()->word->flag(
W_REP_CHAR)) {
370 if (page_res_it.word()->best_choice->permuter() ==
USER_DAWG_PERM)
375 if (page_res_it.word()->blamer_bundle !=
nullptr &&
376 page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
378 page_res_it.word()->blamer_bundle->misadaption_debug());
383 if (dopasses == 1)
return true;
385 #ifndef DISABLED_LEGACY_ENGINE
390 page_res_it.restart_page();
396 most_recently_used_ =
this;
427 #endif // ndef DISABLED_LEGACY_ENGINE
434 #ifndef DISABLED_LEGACY_ENGINE
440 #endif //ndef DISABLED_LEGACY_ENGINE
442 const auto pageseg_mode =
static_cast<PageSegMode>(
447 for (page_res_it.restart_page(); page_res_it.word() !=
nullptr;
448 page_res_it.forward()) {
450 const POLY_BLOCK* pb = page_res_it.block()->block !=
nullptr
451 ? page_res_it.block()->block->pdblk.poly_block()
455 page_res_it.DeleteCurrentWord();
459 if (monitor !=
nullptr) {
◆ recog_interactive()
bool tesseract::Tesseract::recog_interactive |
( |
PAGE_RES_IT * |
pr_it | ) |
|
Recognize a single word in interactive mode.
- Parameters
-
pr_it | the page results iterator |
Definition at line 77 of file control.cpp.
79 int16_t good_char_qual;
81 WordData word_data(*pr_it);
84 if (lstm_recognizer_ ==
nullptr) {
85 #ifndef DISABLED_LEGACY_ENGINE
87 #endif // ndef DISABLED_LEGACY_ENGINE
91 #ifndef DISABLED_LEGACY_ENGINE
95 tprintf(
"\n%d chars; word_blob_quality: %d; outline_errs: %d; "
96 "char_quality: %d; good_char_quality: %d\n",
101 #endif // ndef DISABLED_LEGACY_ENGINE
◆ recog_pseudo_word()
void tesseract::Tesseract::recog_pseudo_word |
( |
PAGE_RES * |
page_res, |
|
|
TBOX & |
selection_box |
|
) |
| |
◆ recog_training_segmented()
void tesseract::Tesseract::recog_training_segmented |
( |
const STRING & |
fname, |
|
|
PAGE_RES * |
page_res, |
|
|
volatile ETEXT_DESC * |
monitor, |
|
|
FILE * |
output_file |
|
) |
| |
Definition at line 84 of file recogtraining.cpp.
89 const char* lastdot = strrchr(box_fname.
string(),
'.');
90 if (lastdot !=
nullptr)
91 box_fname[lastdot - box_fname.
string()] =
'\0';
94 FILE* box_file = fopen(box_fname.
string(),
"r");
95 if (box_file ==
nullptr) {
96 tprintf(
"Error: Could not open file %s\n", box_fname.
string());
110 int examined_words = 0;
112 keep_going = read_t(&page_res_it, &tbox);
120 keep_going = read_t(&page_res_it, &tbox);
130 keep_going = read_t(&page_res_it, &tbox);
144 }
while (keep_going);
153 if (page_res_it.
word()) {
159 if (examined_words < 0.85 * total_words) {
161 "TODO(antonova): clean up recog_training_segmented; "
162 " It examined only a small fraction of the ambigs image.\n");
164 tprintf(
"recog_training_segmented: examined %d / %d words.\n", examined_words,
◆ recog_word()
void tesseract::Tesseract::recog_word |
( |
WERD_RES * |
word | ) |
|
Definition at line 45 of file tfacepp.cpp.
51 tprintf(
"recog_word ASSERT FAIL String:\"%s\"; "
52 "Strlen=%d; #Blobs=%d\n",
60 tprintf(
"Not all words have valid states relative to ratings matrix!!");
80 tprintf(
"Permuter Type Flipped from %d to %d\n",
◆ recog_word_recursive()
void tesseract::Tesseract::recog_word_recursive |
( |
WERD_RES * |
word | ) |
|
Definition at line 109 of file tfacepp.cpp.
115 tprintf(
"recog_word: Discarded long string \"%s\""
116 " (%d characters vs %d blobs)\n",
◆ RecogAllWordsPassN()
Definition at line 213 of file control.cpp.
222 for (
int w = 0; w < words->
size(); ++w) {
223 WordData* word = &(*words)[w];
224 if (w > 0) word->prev_word = &(*words)[w - 1];
225 if (monitor !=
nullptr) {
241 for (; w < words->
size(); ++w) {
247 if (word->word->tess_failed) {
249 for (s = 0; s < word->lang_words.size() &&
250 word->lang_words[s]->tess_failed; ++s) {}
252 if (s > word->lang_words.size())
continue;
255 while (pr_it->
word() !=
nullptr && pr_it->
word() != word->
word)
258 bool make_next_word_fuzzy =
false;
259 #ifndef DISABLED_LEGACY_ENGINE
265 #endif // ndef DISABLED_LEGACY_ENGINE
269 tprintf(
"Pass%d: %s [%s]\n", pass_n,
270 word->word->best_choice->unichar_string().string(),
271 word->word->best_choice->debug_string().string());
274 if (make_next_word_fuzzy && pr_it->
word() !=
nullptr) {
◆ recognize_page()
void tesseract::Tesseract::recognize_page |
( |
STRING & |
image_name | ) |
|
◆ reject_edge_blobs()
void tesseract::Tesseract::reject_edge_blobs |
( |
WERD_RES * |
word | ) |
|
Definition at line 269 of file reject.cpp.
274 for (
int blobindex = 0; blobindex < blobcount; blobindex++) {
280 word->
reject_map[blobindex].setrej_edge_char();
◆ reject_I_1_L()
void tesseract::Tesseract::reject_I_1_L |
( |
WERD_RES * |
word | ) |
|
◆ reject_mostly_rejects()
void tesseract::Tesseract::reject_mostly_rejects |
( |
WERD_RES * |
word | ) |
|
Definition at line 579 of file reject.cpp.
584 int16_t char_quality;
585 int16_t accepted_char_quality;
◆ rejection_passes()
void tesseract::Tesseract::rejection_passes |
( |
PAGE_RES * |
page_res, |
|
|
ETEXT_DESC * |
monitor, |
|
|
const TBOX * |
target_word_box, |
|
|
const char * |
word_config |
|
) |
| |
Definition at line 612 of file control.cpp.
624 if (monitor !=
nullptr) {
630 page_res_it.forward();
637 if (target_word_box &&
639 *target_word_box, word_config, 4)) {
640 page_res_it.forward();
645 page_res_it.rej_stat_word();
653 int16_t all_char_quality;
654 int16_t accepted_all_char_quality;
656 &all_char_quality, &accepted_all_char_quality);
667 (blob_quality == 0) && (outline_errs >= chars_in_word))
670 page_res_it.forward();
675 (
"QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
676 " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
690 bool good_quality_doc =
◆ repeated_nonalphanum_wd()
bool tesseract::Tesseract::repeated_nonalphanum_wd |
( |
WERD_RES * |
word, |
|
|
ROW * |
row |
|
) |
| |
Definition at line 588 of file reject.cpp.
602 (char_quality == accepted_char_quality))
◆ ReportFailedBox()
void tesseract::Tesseract::ReportFailedBox |
( |
int |
boxfile_lineno, |
|
|
TBOX |
box, |
|
|
const char * |
box_ch, |
|
|
const char * |
err_msg |
|
) |
| |
◆ ReportXhtFixResult()
void tesseract::Tesseract::ReportXhtFixResult |
( |
bool |
accept_new_word, |
|
|
float |
new_x_ht, |
|
|
WERD_RES * |
word, |
|
|
WERD_RES * |
new_word |
|
) |
| |
Definition at line 1462 of file control.cpp.
1464 tprintf(
"New XHT Match:%s = %s ",
1475 new_x_ht > 0.1 ?
"STILL DOUBT" :
"OK",
1476 accept_new_word ?
"ACCEPTED" :
"");
◆ ReSegmentByClassification()
void tesseract::Tesseract::ReSegmentByClassification |
( |
PAGE_RES * |
page_res | ) |
|
◆ ResegmentCharBox()
bool tesseract::Tesseract::ResegmentCharBox |
( |
PAGE_RES * |
page_res, |
|
|
const TBOX * |
prev_box, |
|
|
const TBOX & |
box, |
|
|
const TBOX * |
next_box, |
|
|
const char * |
correct_text |
|
) |
| |
Gather consecutive blobs that match the given box into the best_state and corresponding correct_text.
Fights over which box owns which blobs are settled by pre-chopping and applying the blobs to box or next_box with the least non-overlap.
- Returns
- false if the box was in error, which can only be caused by failing to find an appropriate blob for a box.
This means that occasionally, blobs may be incorrectly segmented if the chopper fails to find a suitable chop point.
Definition at line 329 of file applybox.cpp.
◆ ResegmentWordBox()
bool tesseract::Tesseract::ResegmentWordBox |
( |
BLOCK_LIST * |
block_list, |
|
|
const TBOX & |
box, |
|
|
const TBOX * |
next_box, |
|
|
const char * |
correct_text |
|
) |
| |
◆ ResetAdaptiveClassifier()
void tesseract::Tesseract::ResetAdaptiveClassifier |
( |
| ) |
|
Definition at line 587 of file tesseractclass.cpp.
589 for (
int i = 0; i < sub_langs_.size(); ++i) {
590 sub_langs_[i]->ResetAdaptiveClassifierInternal();
◆ ResetDocumentDictionary()
void tesseract::Tesseract::ResetDocumentDictionary |
( |
| ) |
|
Definition at line 597 of file tesseractclass.cpp.
599 for (
int i = 0; i < sub_langs_.size(); ++i) {
600 sub_langs_[i]->getDict().ResetDocumentDictionary();
◆ reskew()
const FCOORD& tesseract::Tesseract::reskew |
( |
| ) |
const |
|
inline |
◆ RetryWithLanguage()
Definition at line 904 of file control.cpp.
909 tprintf(
"Trying word using lang %s, oem %d\n",
913 PointerVector<WERD_RES> new_words;
914 (this->*recognizer)(word_data, in_word, &new_words);
915 if (new_words.empty()) {
918 new_words.push_back(*in_word);
922 for (
int i = 0; i < new_words.size(); ++i)
923 new_words[i]->DebugTopChoice(
"Lang result");
929 debug, &new_words, best_words);
◆ right_to_left()
bool tesseract::Tesseract::right_to_left |
( |
| ) |
const |
|
inline |
◆ RunOldFixXht()
bool tesseract::Tesseract::RunOldFixXht |
( |
WERD_RES * |
word, |
|
|
BLOCK * |
block, |
|
|
ROW * |
row |
|
) |
| |
◆ safe_dict_word()
int16_t tesseract::Tesseract::safe_dict_word |
( |
const WERD_RES * |
werd_res | ) |
|
◆ scaled_color()
Pix* tesseract::Tesseract::scaled_color |
( |
| ) |
const |
|
inline |
◆ scaled_factor()
int tesseract::Tesseract::scaled_factor |
( |
| ) |
const |
|
inline |
◆ script_pos_pass()
void tesseract::Tesseract::script_pos_pass |
( |
PAGE_RES * |
page_res | ) |
|
Definition at line 734 of file control.cpp.
736 for (page_res_it.restart_page(); page_res_it.word() !=
nullptr;
737 page_res_it.forward()) {
740 page_res_it.forward();
743 const float x_height = page_res_it.block()->block->x_height();
744 float word_x_height = word->
x_height;
745 if (word_x_height < word->best_choice->min_x_height() ||
753 const double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
755 small_cap_xheight - small_cap_delta <= word_x_height &&
756 word_x_height <= small_cap_xheight + small_cap_delta) {
766 if (num_upper > 0 && num_lower == 0)
◆ SearchForText()
void tesseract::Tesseract::SearchForText |
( |
const GenericVector< BLOB_CHOICE_LIST * > * |
choices, |
|
|
int |
choices_pos, |
|
|
int |
choices_length, |
|
|
const GenericVector< UNICHAR_ID > & |
target_text, |
|
|
int |
text_index, |
|
|
float |
rating, |
|
|
GenericVector< int > * |
segmentation, |
|
|
float * |
best_rating, |
|
|
GenericVector< int > * |
best_segmentation |
|
) |
| |
◆ SearchWords()
Definition at line 259 of file linerec.cpp.
264 const Dict* stopper_dict = lstm_recognizer_->
GetDict();
265 if (stopper_dict ==
nullptr) stopper_dict = &
getDict();
266 bool any_nonspace_delimited =
false;
267 for (
int w = 0; w < words->
size(); ++w) {
271 any_nonspace_delimited =
true;
275 for (
int w = 0; w < words->
size(); ++w) {
295 if (
getDict().stopper_debug_level >= 1) {
296 tprintf(
"Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
◆ SegmentPage()
int tesseract::Tesseract::SegmentPage |
( |
const STRING * |
input_file, |
|
|
BLOCK_LIST * |
blocks, |
|
|
Tesseract * |
osd_tess, |
|
|
OSResults * |
osr |
|
) |
| |
Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be nullptr. On return the blocks list owns all the constructed page layout.
Definition at line 115 of file pagesegmain.cpp.
119 BLOCK_IT block_it(blocks);
120 auto* block =
new BLOCK(
"",
true, 0, 0, 0, 0, width, height);
122 block_it.add_to_end(block);
133 BLOBNBOX_LIST diacritic_blobs;
134 int auto_page_seg_ret_val = 0;
135 TO_BLOCK_LIST to_blocks;
139 pageseg_mode, blocks, &to_blocks,
142 return auto_page_seg_ret_val;
146 deskew_ =
FCOORD(1.0f, 0.0f);
147 reskew_ =
FCOORD(1.0f, 0.0f);
149 Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
150 if (pixcleaned !=
nullptr) {
151 pixDestroy(&pix_binary_);
152 pix_binary_ = pixcleaned;
157 if (auto_page_seg_ret_val < 0) {
161 if (blocks->empty()) {
170 textord_.
TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
171 pix_thresholds_, pix_grey_, splitting || cjk_mode,
172 &diacritic_blobs, blocks, &to_blocks);
173 return auto_page_seg_ret_val;
◆ SelectGoodDiacriticOutlines()
Definition at line 1140 of file control.cpp.
1145 float target_cert = certainty_threshold;
1146 if (blob !=
nullptr) {
1150 tprintf(
"No Noise blob classified as %s=%g(%g) at:", best_str.
string(),
1151 target_cert, target_c2);
1161 pr_it, blob, &all_str);
1164 for (
int i = 0; i < test_outlines.
size(); ++i) {
1165 if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
1167 tprintf(
"All Noise blob classified as %s=%g, delta=%g at:",
1168 all_str.
string(), best_cert, best_cert - target_cert);
1174 while (num_outlines > 1 && best_index >= 0 &&
1175 (blob ==
nullptr || best_cert < target_cert || blob !=
nullptr)) {
1178 for (
int i = 0; i < outlines.
size(); ++i) {
1179 if (test_outlines[i]) {
1180 test_outlines[i] =
false;
1186 for (
int j = 0; j < outlines.
size(); ++j) {
1187 if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
1188 tprintf(
"%d", test_outlines[j]);
1190 tprintf(
" blob classified as %s=%g, delta=%g) at:", str.
string(),
1191 cert, cert - target_cert);
1194 if (cert > best_cert) {
1197 best_outlines = test_outlines;
1199 test_outlines[i] =
true;
1202 if (best_index >= 0) {
1203 test_outlines[best_index] =
false;
1207 if (best_cert >= target_cert) {
1209 *ok_outlines = best_outlines;
1211 tprintf(
"%s noise combination ", blob ?
"Adding" :
"New");
1212 for (
int i = 0; i < best_outlines.
size(); ++i) {
1213 tprintf(
"%d", best_outlines[i]);
1215 tprintf(
" yields certainty %g, beating target of %g\n", best_cert,
◆ set_done()
void tesseract::Tesseract::set_done |
( |
WERD_RES * |
word, |
|
|
int16_t |
pass |
|
) |
| |
◆ set_pix_grey()
void tesseract::Tesseract::set_pix_grey |
( |
Pix * |
grey_pix | ) |
|
|
inline |
Definition at line 208 of file tesseractclass.h.
209 pixDestroy(&pix_grey_);
210 pix_grey_ = grey_pix;
◆ set_pix_original()
void tesseract::Tesseract::set_pix_original |
( |
Pix * |
original_pix | ) |
|
|
inline |
Definition at line 216 of file tesseractclass.h.
217 pixDestroy(&pix_original_);
218 pix_original_ = original_pix;
220 for (
int i = 0; i < sub_langs_.size(); ++i) {
221 sub_langs_[i]->set_pix_original(original_pix ? pixClone(original_pix)
◆ set_pix_thresholds()
void tesseract::Tesseract::set_pix_thresholds |
( |
Pix * |
thresholds | ) |
|
|
inline |
Definition at line 242 of file tesseractclass.h.
243 pixDestroy(&pix_thresholds_);
244 pix_thresholds_ = thresholds;
◆ set_source_resolution()
void tesseract::Tesseract::set_source_resolution |
( |
int |
ppi | ) |
|
|
inline |
◆ set_unlv_suspects()
void tesseract::Tesseract::set_unlv_suspects |
( |
WERD_RES * |
word | ) |
|
Definition at line 277 of file output.cpp.
281 for (i = 0; i < len; i++) {
282 if (word_res->reject_map[i].rejected())
283 word_res->reject_map[i].setrej_minimal_rej_accept();
296 for (i = 0; i < len; ++i) {
297 if (word_res->reject_map[i].rejected() &&
298 uchset.get_isalpha(word.unichar_id(i)))
299 word_res->reject_map[i].setrej_minimal_rej_accept();
310 for (i = 0; i < len; ++i) {
311 if (word_res->reject_map[i].rejected() &&
312 (!uchset.eq(word.unichar_id(i),
" ")))
313 word_res->reject_map[i].setrej_minimal_rej_accept();
317 for (i = 0; i < len; i++) {
318 if (word_res->reject_map[i].rejected()) {
319 if (word_res->reject_map[i].flag(
R_DOC_REJ))
320 word_res->reject_map[i].setrej_minimal_rej_accept();
322 word_res->reject_map[i].setrej_minimal_rej_accept();
323 if (word_res->reject_map[i].flag(
R_ROW_REJ))
324 word_res->reject_map[i].setrej_minimal_rej_accept();
333 for (i = 0; i < len; i++) {
334 if (word_res->reject_map[i].rejected()) {
337 word_res->reject_map[i].setrej_minimal_rej_accept();
341 word_res->reject_map[i].setrej_minimal_rej_accept();
347 word.unichar_string().string(),
348 word.unichar_lengths().string()) !=
351 word.unichar_lengths().string())) {
353 for (i = 0; i < len; i++) {
354 if (word_res->reject_map[i].rejected() &&
355 (!word_res->reject_map[i].perm_rejected() ||
359 word_res->reject_map[i].setrej_minimal_rej_accept();
368 for (
int i = 0; i < word.
length(); ++i) {
◆ set_word_fonts()
void tesseract::Tesseract::set_word_fonts |
( |
WERD_RES * |
word | ) |
|
set_word_fonts
Get the fonts for the word.
Definition at line 1962 of file control.cpp.
1968 #ifndef DISABLED_LEGACY_ENGINE
1970 if (fontinfo_size == 0)
return;
1976 tprintf(
"Examining fonts in %s\n",
1981 if (choice ==
nullptr)
continue;
1983 for (
int f = 0; f < fonts.
size(); ++f) {
1984 const int fontinfo_id = fonts[f].fontinfo_id;
1985 if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
1986 font_total_score[fontinfo_id] += fonts[f].score;
1991 int score1 = 0, score2 = 0;
1992 int16_t font_id1 = -1, font_id2 = -1;
1993 for (
int f = 0; f < fontinfo_size; ++f) {
1995 tprintf(
"Font %s, total score = %d\n",
1998 if (font_total_score[f] > score1) {
2000 font_id2 = font_id1;
2001 score1 = font_total_score[f];
2003 }
else if (font_total_score[f] > score2) {
2004 score2 = font_total_score[f];
2018 tprintf(
"Word modal font=%s, score=%d, 2nd choice %s/%d\n",
2023 tprintf(
"Word modal font=%s, score=%d. No 2nd choice\n",
2028 #endif // ndef DISABLED_LEGACY_ENGINE
◆ SetBlackAndWhitelist()
void tesseract::Tesseract::SetBlackAndWhitelist |
( |
| ) |
|
Definition at line 604 of file tesseractclass.cpp.
609 if (lstm_recognizer_) {
616 for (
int i = 0; i < sub_langs_.size(); ++i) {
617 sub_langs_[i]->unicharset.set_black_and_whitelist(
620 if (sub_langs_[i]->lstm_recognizer_) {
621 UNICHARSET& lstm_unicharset =
const_cast<UNICHARSET&
> (sub_langs_[i]->lstm_recognizer_->GetUnicharset());
◆ SetEquationDetect()
void tesseract::Tesseract::SetEquationDetect |
( |
EquationDetect * |
detector | ) |
|
◆ SetScaledColor()
void tesseract::Tesseract::SetScaledColor |
( |
int |
factor, |
|
|
Pix * |
color |
|
) |
| |
|
inline |
Definition at line 264 of file tesseractclass.h.
265 scaled_factor_ = factor;
266 scaled_color_ = color;
◆ SetupAllWordsPassN()
void tesseract::Tesseract::SetupAllWordsPassN |
( |
int |
pass_n, |
|
|
const TBOX * |
target_word_box, |
|
|
const char * |
word_config, |
|
|
PAGE_RES * |
page_res, |
|
|
GenericVector< WordData > * |
words |
|
) |
| |
If tesseract is to be run, sets the words up ready for it.
Definition at line 154 of file control.cpp.
161 for (page_res_it.restart_page(); page_res_it.word() !=
nullptr;
162 page_res_it.forward()) {
163 if (target_word_box ==
nullptr ||
165 *target_word_box, word_config, 1)) {
170 for (
int w = 0; w < words->
size(); ++w) {
172 if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];
◆ SetupApplyBoxes()
Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: All fuzzy spaces are removed, and all the words are maximally chopped.
Definition at line 207 of file applybox.cpp.
211 BLOCK_IT b_it(block_list);
212 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
213 BLOCK* block = b_it.data();
215 for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
216 ROW* row = r_it.data();
218 for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
219 WERD* word = w_it.data();
221 delete w_it.extract();
229 auto* page_res =
new PAGE_RES(
false, block_list,
nullptr);
232 while ((word_res = pr_it.word()) !=
nullptr) {
234 pr_it.row()->row, word_res);
◆ SetupPageSegAndDetectOrientation()
ColumnFinder * tesseract::Tesseract::SetupPageSegAndDetectOrientation |
( |
PageSegMode |
pageseg_mode, |
|
|
BLOCK_LIST * |
blocks, |
|
|
Tesseract * |
osd_tess, |
|
|
OSResults * |
osr, |
|
|
TO_BLOCK_LIST * |
to_blocks, |
|
|
Pix ** |
photo_mask_pix, |
|
|
Pix ** |
music_mask_pix |
|
) |
| |
Sets up auto page segmentation, determines the orientation, and corrects it. Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to facilitate testing. photo_mask_pix is a pointer to a null pointer that will be filled on return with the Leptonica photo mask, which must be pixDestroyed by the caller. to_blocks is an empty list that will be filled with (usually a single) block that is used during layout analysis. This ugly API is required because of the possibility of a UNLV zone file. TODO(rays) clean this up. See AutoPageSeg for other arguments. The returned ColumnFinder must be deleted after use.
Definition at line 286 of file pagesegmain.cpp.
290 pixa_debug_.
AddPix(pix_binary_,
"NoLines");
295 pixa_debug_.
AddPix(pix_binary_,
"NoImages");
302 TO_BLOCK_IT to_block_it(to_blocks);
306 TO_BLOCK* to_block = to_block_it.data();
308 ColumnFinder* finder =
nullptr;
309 int estimated_resolution = source_resolution_;
314 estimated_resolution = res;
315 tprintf(
"Estimating resolution as %d\n", estimated_resolution);
320 finder =
new ColumnFinder(
static_cast<int>(to_block->
line_size),
324 &h_lines, vertical_x, vertical_y);
326 finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
328 #ifndef DISABLED_LEGACY_ENGINE
334 BLOBNBOX_CLIST osd_blobs;
339 int osd_orientation = 0;
346 to_block, &osd_blobs);
348 if (
PSM_OSD_ENABLED(pageseg_mode) && osd_tess !=
nullptr && osr !=
nullptr) {
350 if (osd_tess !=
this) {
353 AddAllScriptsConverted(
unicharset, osd_tess->unicharset, &osd_scripts);
354 for (
int s = 0; s < sub_langs_.size(); ++s) {
355 AddAllScriptsConverted(sub_langs_[s]->
unicharset,
356 osd_tess->unicharset, &osd_scripts);
367 for (
int i = 0; i < 4; ++i) {
368 if (i != osd_orientation &&
374 const char* best_script_str =
375 osd_tess->unicharset.get_script_from_script_id(best_script_id);
376 bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
377 best_script_id == osd_tess->unicharset.hiragana_sid() ||
378 best_script_id == osd_tess->unicharset.katakana_sid() ||
379 strcmp(
"Japanese", best_script_str) == 0 ||
380 strcmp(
"Korean", best_script_str) == 0 ||
381 strcmp(
"Hangul", best_script_str) == 0;
383 finder->set_cjk_script(
true);
387 if (!cjk && !vertical_text && osd_orientation == 2) {
389 tprintf(
"OSD: Weak margin (%.2f), horiz textlines, not CJK: "
390 "Don't rotate.\n", osd_margin);
394 "OSD: Weak margin (%.2f) for %d blob text block, "
395 "but using orientation anyway: %d\n",
396 osd_margin, osd_blobs.length(), osd_orientation);
400 osd_blobs.shallow_clear();
401 finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
403 #endif // ndef DISABLED_LEGACY_ENGINE
◆ SetupUniversalFontIds()
void tesseract::Tesseract::SetupUniversalFontIds |
( |
| ) |
|
Definition at line 449 of file tessedit.cpp.
455 nullptr, 0,
nullptr,
nullptr,
false, mgr))
463 #endif // ndef DISABLED_LEGACY_ENGINE
◆ SetupWordPassN()
void tesseract::Tesseract::SetupWordPassN |
( |
int |
pass_n, |
|
|
WordData * |
word |
|
) |
| |
Definition at line 177 of file control.cpp.
178 if (pass_n == 1 || !word->word->done) {
185 word->row, word->block);
186 }
else if (pass_n == 2) {
188 word->word->caps_height = 0.0;
189 if (word->word->x_height == 0.0f)
190 word->word->x_height = word->row->x_height();
192 word->lang_words.truncate(0);
193 for (
int s = 0; s <= sub_langs_.size(); ++s) {
195 Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] :
this;
198 word->lang_words.push_back(word_res);
200 if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode !=
OEM_LSTM_ONLY) {
201 word_res->SetupForRecognition(
202 lang_t->unicharset, lang_t,
BestPix(),
203 lang_t->tessedit_ocr_engine_mode,
nullptr,
204 lang_t->classify_bln_numeric_mode,
205 lang_t->textord_use_cjk_fp_model,
206 lang_t->poly_allow_detailed_fx, word->row, word->block);
◆ SetupWordScripts()
void tesseract::Tesseract::SetupWordScripts |
( |
BLOCK_LIST * |
blocks | ) |
|
◆ source_resolution()
int tesseract::Tesseract::source_resolution |
( |
| ) |
const |
|
inline |
◆ split_and_recog_word()
void tesseract::Tesseract::split_and_recog_word |
( |
WERD_RES * |
word | ) |
|
◆ split_word()
Definition at line 181 of file tfacepp.cpp.
191 auto *chopped2 =
new TWERD;
193 for (
int i = split_pt; i < chopped->
NumBlobs(); ++i) {
194 chopped2->blobs.push_back(chopped->
blobs[i]);
198 delete word2->chopped_word;
199 word2->chopped_word =
nullptr;
203 word2->ClearResults();
205 word2->chopped_word = chopped2;
207 word2->SetupBasicsFromChoppedWord(
unicharset);
210 if (orig_bb !=
nullptr) {
216 word2->chopped_word->blobs[0]->bounding_box().left(),
221 *right_piece = word2;
222 *orig_blamer_bundle = orig_bb;
◆ SubAndSuperscriptFix()
bool tesseract::Tesseract::SubAndSuperscriptFix |
( |
WERD_RES * |
word | ) |
|
Attempt to split off any high (or low) bits at the ends of the word with poor certainty and recognize them separately. If the certainty gets much better and other sanity checks pass, accept.
This superscript fix is meant to be called in the second pass of recognition when we have tried once and already have a preliminary answer for word.
- Returns
- Whether we modified the given word.
Definition at line 102 of file superscript.cpp.
107 int num_leading, num_trailing;
109 float leading_certainty, trailing_certainty;
110 float avg_certainty, unlikely_threshold;
114 word, &num_leading, &sp_leading, &leading_certainty,
115 &num_trailing, &sp_trailing, &trailing_certainty,
116 &avg_certainty, &unlikely_threshold);
118 const char *leading_pos = sp_leading ==
SP_SUBSCRIPT ?
"sub" :
"super";
119 const char *trailing_pos = sp_trailing ==
SP_SUBSCRIPT ?
"sub" :
"super";
127 int num_remainder_leading = 0, num_remainder_trailing = 0;
128 if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
133 int last_word_char = num_blobs - 1 - num_trailing;
136 last_char_certainty <= unlikely_threshold) {
138 YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
139 nullptr,
nullptr, &rpos, &num_remainder_trailing);
140 if (num_trailing > 0 && rpos != sp_trailing) num_remainder_trailing = 0;
141 if (num_remainder_trailing > 0 &&
142 last_char_certainty < trailing_certainty) {
143 trailing_certainty = last_char_certainty;
146 bool another_blob_available = (num_remainder_trailing == 0) ||
147 num_leading + num_trailing + 1 < num_blobs;
149 if (another_blob_available &&
151 first_char_certainty <= unlikely_threshold) {
153 YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top,
154 &lpos, &num_remainder_leading,
nullptr,
nullptr);
155 if (num_leading > 0 && lpos != sp_leading) num_remainder_leading = 0;
156 if (num_remainder_leading > 0 &&
157 first_char_certainty < leading_certainty) {
158 leading_certainty = first_char_certainty;
164 if (num_leading + num_trailing +
165 num_remainder_leading + num_remainder_trailing == 0) {
170 tprintf(
"Candidate for superscript detection: %s (",
172 if (num_leading || num_remainder_leading) {
173 tprintf(
"%d.%d %s-leading ", num_leading, num_remainder_leading,
176 if (num_trailing || num_remainder_trailing) {
177 tprintf(
"%d.%d %s-trailing ", num_trailing, num_remainder_trailing,
186 tprintf(
" Certainties -- Average: %.2f Unlikely thresh: %.2f ",
187 avg_certainty, unlikely_threshold);
189 tprintf(
"Orig. leading (min): %.2f ", leading_certainty);
191 tprintf(
"Orig. trailing (min): %.2f ", trailing_certainty);
198 int num_chopped_leading =
199 LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
200 int num_chopped_trailing =
201 TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
203 int retry_leading = 0;
204 int retry_trailing = 0;
205 bool is_good =
false;
207 num_chopped_leading, leading_certainty, sp_leading,
208 num_chopped_trailing, trailing_certainty, sp_trailing,
209 word, &is_good, &retry_leading, &retry_trailing);
212 }
else if (retry_leading || retry_trailing) {
213 int retry_chopped_leading =
214 LeadingUnicharsToChopped(revised, retry_leading);
215 int retry_chopped_trailing =
216 TrailingUnicharsToChopped(revised, retry_trailing);
218 retry_chopped_leading, leading_certainty, sp_leading,
219 retry_chopped_trailing, trailing_certainty, sp_trailing,
220 revised, &is_good, &retry_leading, &retry_trailing);
◆ terrible_word_crunch()
Definition at line 503 of file docqual.cpp.
524 (garbage_level !=
G_OK))
527 (garbage_level !=
G_OK))
530 if (crunch_mode > 0) {
532 tprintf (
"Terrible_word_crunch (%d) on \"%s\"\n",
◆ tess_acceptable_word()
bool tesseract::Tesseract::tess_acceptable_word |
( |
WERD_RES * |
word | ) |
|
◆ tess_add_doc_word()
void tesseract::Tesseract::tess_add_doc_word |
( |
WERD_CHOICE * |
word_choice | ) |
|
◆ tess_segment_pass_n()
void tesseract::Tesseract::tess_segment_pass_n |
( |
int |
pass_n, |
|
|
WERD_RES * |
word |
|
) |
| |
Definition at line 32 of file tessbox.cpp.
33 int saved_enable_assoc = 0;
34 int saved_chop_enable = 0;
◆ TestNewNormalization()
bool tesseract::Tesseract::TestNewNormalization |
( |
int |
original_misfits, |
|
|
float |
baseline_shift, |
|
|
float |
new_x_ht, |
|
|
WERD_RES * |
word, |
|
|
BLOCK * |
block, |
|
|
ROW * |
row |
|
) |
| |
Definition at line 1519 of file control.cpp.
1522 bool accept_new_x_ht =
false;
1526 new_x_ht_word.blamer_bundle->CopyTruth(*(word->
blamer_bundle));
1528 new_x_ht_word.x_height = new_x_ht;
1529 new_x_ht_word.baseline_shift = baseline_shift;
1530 new_x_ht_word.caps_height = 0.0;
1531 new_x_ht_word.SetupForRecognition(
1536 if (!new_x_ht_word.tess_failed) {
1539 tprintf(
"Old misfits=%d with x-height %f, new=%d with x-height %f\n",
1541 new_misfits, new_x_ht);
1542 tprintf(
"Old rating= %f, certainty=%f, new=%f, %f\n",
1544 new_x_ht_word.best_choice->rating(),
1545 new_x_ht_word.best_choice->certainty());
1548 accept_new_x_ht = new_misfits < original_misfits &&
1549 (new_x_ht_word.best_choice->certainty() >
1551 new_x_ht_word.best_choice->rating() <
1557 if (accept_new_x_ht) {
◆ textord()
const Textord& tesseract::Tesseract::textord |
( |
| ) |
const |
|
inline |
◆ TidyUp()
void tesseract::Tesseract::TidyUp |
( |
PAGE_RES * |
page_res | ) |
|
◆ tilde_crunch()
void tesseract::Tesseract::tilde_crunch |
( |
PAGE_RES_IT & |
page_res_it | ) |
|
Definition at line 417 of file docqual.cpp.
421 bool prev_potential_marked =
false;
422 bool found_terrible_word =
false;
426 while (page_res_it.
word() !=
nullptr) {
428 if (pb !=
nullptr && !pb->
IsText()) {
432 word = page_res_it.
word();
441 found_terrible_word =
false;
443 prev_potential_marked =
false;
452 tprintf (
"T CRUNCHING: \"%s\"\n",
456 if (prev_potential_marked) {
457 while (copy_it.
word () != word) {
459 tprintf (
"P1 CRUNCHING: \"%s\"\n",
465 prev_potential_marked =
false;
467 found_terrible_word =
true;
471 garbage_level, ok_dict_word))) {
472 if (found_terrible_word) {
474 tprintf (
"P2 CRUNCHING: \"%s\"\n",
479 else if (!prev_potential_marked) {
480 copy_it = page_res_it;
481 prev_potential_marked =
true;
483 tprintf (
"P3 CRUNCHING: \"%s\"\n",
489 found_terrible_word =
false;
491 prev_potential_marked =
false;
493 tprintf (
"NO CRUNCH: \"%s\"\n",
◆ tilde_delete()
void tesseract::Tesseract::tilde_delete |
( |
PAGE_RES_IT & |
page_res_it | ) |
|
Definition at line 589 of file docqual.cpp.
592 bool deleting_from_bol =
false;
593 bool marked_delete_point =
false;
594 int16_t debug_delete_mode;
596 int16_t x_debug_delete_mode;
600 while (page_res_it.
word() !=
nullptr) {
601 word = page_res_it.
word();
607 tprintf (
"BOL CRUNCH DELETING(%d): \"%s\"\n",
612 deleting_from_bol =
true;
614 if (marked_delete_point) {
615 while (copy_it.
word() != word) {
617 x_debug_delete_mode);
619 tprintf (
"EOL CRUNCH DELETING(%d): \"%s\"\n",
628 tprintf (
"EOL CRUNCH DELETING(%d): \"%s\"\n",
633 deleting_from_bol =
false;
634 marked_delete_point =
false;
637 if (!marked_delete_point) {
638 copy_it = page_res_it;
639 marked_delete_point =
true;
644 deleting_from_bol =
false;
646 marked_delete_point =
false;
◆ TrainedXheightFix()
bool tesseract::Tesseract::TrainedXheightFix |
( |
WERD_RES * |
word, |
|
|
BLOCK * |
block, |
|
|
ROW * |
row |
|
) |
| |
Definition at line 1485 of file control.cpp.
1487 if (original_misfits == 0)
1489 float baseline_shift = 0.0f;
1491 if (baseline_shift != 0.0f) {
1497 if (original_misfits > 0) {
1498 float new_baseline_shift;
◆ TrainFromBoxes()
Definition at line 81 of file linerec.cpp.
85 int box_count = boxes.
size();
90 while (end_box < texts.
size() && texts[end_box] ==
"\t") ++end_box;
91 for (
int start_box = end_box; start_box < box_count; start_box = end_box) {
93 TBOX line_box = boxes[start_box];
94 STRING line_str = texts[start_box];
95 for (end_box = start_box + 1; end_box < box_count && texts[end_box] !=
"\t";
97 line_box += boxes[end_box];
98 line_str += texts[end_box];
101 BLOCK* best_block =
nullptr;
102 int best_overlap = 0;
103 BLOCK_IT b_it(block_list);
104 for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
105 BLOCK* block = b_it.data();
112 if (overlap_box.
area() > best_overlap) {
113 best_overlap = overlap_box.
area();
118 ImageData* imagedata =
nullptr;
119 if (best_block ==
nullptr) {
120 tprintf(
"No block overlapping textline: %s\n", line_str.
string());
122 imagedata =
GetLineData(line_box, boxes, texts, start_box, end_box,
125 if (imagedata !=
nullptr)
126 training_data->AddPageToDocument(imagedata);
129 while (end_box < texts.
size() && texts[end_box] ==
"\t") ++end_box;
◆ TrainLineRecognizer()
bool tesseract::Tesseract::TrainLineRecognizer |
( |
const STRING & |
input_imagename, |
|
|
const STRING & |
output_basename, |
|
|
BLOCK_LIST * |
block_list |
|
) |
| |
Definition at line 44 of file linerec.cpp.
47 STRING lstmf_name = output_basename +
".lstmf";
48 DocumentData images(lstmf_name);
51 if (!images.LoadDocument(lstmf_name.
c_str(), 0, 0,
nullptr)) {
52 tprintf(
"Failed to read training data from %s!\n", lstmf_name.
c_str());
62 tprintf(
"Failed to read boxes from %s\n", input_imagename.
c_str());
66 if (images.PagesSize() == 0) {
67 tprintf(
"Failed to read pages from %s\n", input_imagename.
c_str());
71 if (!images.SaveDocument(lstmf_name.
c_str(),
nullptr)) {
72 tprintf(
"Failed to write training data to %s!\n", lstmf_name.
c_str());
◆ TrySuperscriptSplits()
WERD_RES * tesseract::Tesseract::TrySuperscriptSplits |
( |
int |
num_chopped_leading, |
|
|
float |
leading_certainty, |
|
|
ScriptPos |
leading_pos, |
|
|
int |
num_chopped_trailing, |
|
|
float |
trailing_certainty, |
|
|
ScriptPos |
trailing_pos, |
|
|
WERD_RES * |
word, |
|
|
bool * |
is_good, |
|
|
int * |
retry_rebuild_leading, |
|
|
int * |
retry_rebuild_trailing |
|
) |
| |
Try splitting off the given number of (chopped) blobs from the front and back of the given word and recognizing the pieces.
- Parameters
-
[in] | num_chopped_leading | how many chopped blobs from the left end of the word to chop off and try recognizing as a superscript (or subscript) |
[in] | leading_certainty | the (minimum) certainty had by the characters in the original leading section. |
[in] | leading_pos | "super" or "sub" (for debugging) |
[in] | num_chopped_trailing | how many chopped blobs from the right end of the word to chop off and try recognizing as a superscript (or subscript) |
[in] | trailing_certainty | the (minimum) certainty had by the characters in the original trailing section. |
[in] | trailing_pos | "super" or "sub" (for debugging) |
[in] | word | the word to try to chop up. |
[out] | is_good | do we believe our result? |
[out] | retry_rebuild_leading,retry_rebuild_trailing | If non-zero, and !is_good, then the caller may have luck trying to split the returned word with this number of (rebuilt) leading and trailing blobs / unichars. |
- Returns
- A word which is the result of re-recognizing as asked.
Definition at line 383 of file superscript.cpp.
392 *retry_rebuild_leading = *retry_rebuild_trailing = 0;
401 if (num_chopped_leading > 0) {
403 split_word(prefix, num_chopped_leading, &core, &bb0);
408 if (num_chopped_trailing > 0) {
409 int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
423 tprintf(
" recognizing first %d chopped blobs\n", num_chopped_leading);
427 tprintf(
" The leading bits look like %s %s\n",
438 tprintf(
" recognizing middle %d chopped blobs\n",
439 num_chopped - num_chopped_leading - num_chopped_trailing);
448 tprintf(
" recognizing last %d chopped blobs\n", num_chopped_trailing);
452 tprintf(
" The trailing bits look like %s %s\n",
467 retry_rebuild_leading,
nullptr);
471 nullptr, retry_rebuild_trailing);
473 *is_good = good_prefix && good_suffix;
474 if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
497 tprintf(
"%s superscript fix: %s\n", *is_good ?
"ACCEPT" :
"REJECT",
◆ unrej_good_chs()
void tesseract::Tesseract::unrej_good_chs |
( |
WERD_RES * |
word, |
|
|
ROW * |
row |
|
) |
| |
◆ unrej_good_quality_words()
void tesseract::Tesseract::unrej_good_quality_words |
( |
PAGE_RES_IT & |
page_res_it | ) |
|
Definition at line 177 of file docqual.cpp.
182 word = page_res_it.
word ();
183 if (word->reject_map.quality_recoverable_rejects() &&
186 word->best_choice->unichar_string().string(),
187 word->best_choice->unichar_lengths().string())
195 current_row = page_res_it.
row ();
196 while ((page_res_it.
word () !=
nullptr) &&
197 (page_res_it.
row () == current_row))
205 current_block =
nullptr;
206 current_row =
nullptr;
207 while (page_res_it.
word () !=
nullptr) {
208 if (current_block != page_res_it.
block ()) {
209 current_block = page_res_it.
block ();
211 current_block->rej_count = 0;
213 if (current_row != page_res_it.
row ()) {
214 current_row = page_res_it.
row ();
216 current_row->rej_count = 0;
217 current_row->whole_word_rej_count = 0;
234 bool good_quality_doc) {
235 int16_t block_no = 0;
◆ word_adaptable()
bool tesseract::Tesseract::word_adaptable |
( |
WERD_RES * |
word, |
|
|
uint16_t |
mode |
|
) |
| |
Definition at line 51 of file adaptions.cpp.
64 if (flags.bit (ADAPTABLE_WERD)) {
67 tprintf(
"tess_would_adapt bit is false\n");
71 if (flags.bit (ACCEPTABLE_WERD)) {
74 tprintf(
"tess_accepted bit is false\n");
82 if (flags.bit (CHECK_DAWGS) &&
96 if (flags.bit (CHECK_SPACES) &&
102 if (flags.bit (CHECK_AMBIG_WERD) &&
109 tprintf(
"returning status %d\n", status);
◆ word_blank_and_set_display()
bool tesseract::Tesseract::word_blank_and_set_display |
( |
PAGE_RES_IT * |
pr_its | ) |
|
◆ word_bln_display()
bool tesseract::Tesseract::word_bln_display |
( |
PAGE_RES_IT * |
pr_it | ) |
|
◆ word_blob_quality()
int16_t tesseract::Tesseract::word_blob_quality |
( |
WERD_RES * |
word, |
|
|
ROW * |
row |
|
) |
| |
◆ word_char_quality()
void tesseract::Tesseract::word_char_quality |
( |
WERD_RES * |
word, |
|
|
ROW * |
row, |
|
|
int16_t * |
match_count, |
|
|
int16_t * |
accepted_match_count |
|
) |
| |
Definition at line 109 of file docqual.cpp.
120 DocQualCallbacks cb(word);
◆ word_contains_non_1_digit()
bool tesseract::Tesseract::word_contains_non_1_digit |
( |
const char * |
word, |
|
|
const char * |
word_lengths |
|
) |
| |
Definition at line 515 of file reject.cpp.
517 (word_lengths[i] != 1 || word[offset] !=
'1'))
◆ word_deletable()
◆ word_display()
bool tesseract::Tesseract::word_display |
( |
PAGE_RES_IT * |
pr_it | ) |
|
word_display() Word Processor
Display a word according to its display modes
Definition at line 751 of file pgedit.cpp.
757 switch (color_mode) {
767 if (font_info.is_italic())
771 if (font_info.is_bold())
775 if (font_info.is_fixed_pitch())
779 if (font_info.is_serif())
783 if (word_res->small_caps)
787 if (best_choice->BlobPosition(i) ==
SP_DROPCAP)
795 image_win->
Pen(color);
796 TBOX box = box_word->BlobBox(i);
802 #endif // ndef DISABLED_LEGACY_ENGINE
809 if (word->display_flag(
DF_BOX)) {
810 word->bounding_box().plot(image_win,
819 C_BLOB_IT c_it(word->cblob_list());
820 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward())
821 c_it.data()->bounding_box().plot(image_win);
822 displayed_something =
true;
827 word->plot(image_win);
828 displayed_something =
true;
835 tword->
plot(image_win);
837 displayed_something =
true;
843 if (word->display_flag(
DF_TEXT) && word->text() !=
nullptr) {
847 !(word_res->blamer_bundle !=
nullptr &&
848 word_res->blamer_bundle->incorrect_result_reason() ==
IRR_CORRECT)) {
850 const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
851 if (blamer_bundle ==
nullptr) {
858 if (word_res->best_choice ==
nullptr) {
859 best_choice_str =
"NULL";
861 word_res->best_choice->string_and_lengths(&best_choice_str,
nullptr);
863 text += best_choice_str;
872 word_bb = word->bounding_box();
874 word_height = word_bb.height();
875 int text_height = 0.50 * word_height;
876 if (text_height > 20) text_height = 20;
877 image_win->
TextAttributes(
"Arial", text_height,
false,
false,
false);
878 shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
879 image_win->
Text(word_bb.left() + shift,
880 word_bb.bottom() + 0.25 * word_height, text.
string());
882 image_win->
Text(word_bb.left() + shift,
883 word_bb.bottom() + 0.25 * word_height - text_height,
887 displayed_something =
true;
890 if (!displayed_something)
891 word->bounding_box().plot(image_win,
897 #endif // GRAPHICS_DISABLED
◆ word_dumper()
bool tesseract::Tesseract::word_dumper |
( |
PAGE_RES_IT * |
pr_it | ) |
|
word_dumper()
Dump members to the debug window
Definition at line 915 of file pgedit.cpp.
916 tprintf(
"Current blamer debug: %s\n",
917 word_res->blamer_bundle->debug().string());
922 #ifndef GRAPHICS_DISABLED
◆ word_outline_errs()
int16_t tesseract::Tesseract::word_outline_errs |
( |
WERD_RES * |
word | ) |
|
Definition at line 89 of file docqual.cpp.
99 *accepted_match_count = 0;
◆ word_set_display()
bool tesseract::Tesseract::word_set_display |
( |
PAGE_RES_IT * |
pr_it | ) |
|
word_set_display() Word processor
Display word according to current display mode settings
Definition at line 939 of file pgedit.cpp.
945 #ifndef DISABLED_LEGACY_ENGINE
◆ worst_noise_blob()
int16_t tesseract::Tesseract::worst_noise_blob |
( |
WERD_RES * |
word_res, |
|
|
float * |
worst_noise_score |
|
) |
| |
Definition at line 707 of file fixspace.cpp.
709 float noise_score[512];
731 tprintf(
"FP fixspace Noise metrics for \"%s\": ",
738 noise_score[i] = non_noise_limit;
743 tprintf(
"%1.1f ", noise_score[i]);
752 if (noise_score[i] >= non_noise_limit) {
764 if (noise_score[i] >= non_noise_limit) {
773 if (min_noise_blob > max_noise_blob)
776 *worst_noise_score = small_limit;
778 for (i = min_noise_blob; i <= max_noise_blob; i++) {
779 if (noise_score[i] < *worst_noise_score) {
781 *worst_noise_score = noise_score[i];
◆ write_results()
void tesseract::Tesseract::write_results |
( |
PAGE_RES_IT & |
page_res_it, |
|
|
char |
newline_type, |
|
|
bool |
force_eol |
|
) |
| |
Definition at line 98 of file output.cpp.
104 bool need_reject =
false;
182 tprintf (
"Dict word: \"%s\": %d\n",
191 word->
reject_map[i].setrej_minimal_rej_accept();
199 word->
reject_map[i].setrej_minimal_rej_accept();
◆ applybox_debug
int tesseract::Tesseract::applybox_debug = 1 |
◆ applybox_exposure_pattern
char* tesseract::Tesseract::applybox_exposure_pattern = ".exp" |
"Exposure value follows this pattern in the image filename. The names of the image files are expected to be in the form [lang].[fontname].exp[num].tif"
Definition at line 830 of file tesseractclass.h.
◆ applybox_learn_chars_and_char_frags_mode
bool tesseract::Tesseract::applybox_learn_chars_and_char_frags_mode = false |
"Learn both character fragments (as is done in the special low exposure mode) as well as unfragmented characters."
Definition at line 834 of file tesseractclass.h.
◆ applybox_learn_ngrams_mode
bool tesseract::Tesseract::applybox_learn_ngrams_mode = false |
"Each bounding box is assumed to contain ngrams. Only learn the ngrams whose outlines overlap horizontally."
Definition at line 837 of file tesseractclass.h.
◆ applybox_page
int tesseract::Tesseract::applybox_page = 0 |
◆ bidi_debug
int tesseract::Tesseract::bidi_debug = 0 |
◆ bland_unrej
bool tesseract::Tesseract::bland_unrej = false |
◆ chs_leading_punct
char* tesseract::Tesseract::chs_leading_punct = "('`\"" |
◆ chs_trailing_punct1
char* tesseract::Tesseract::chs_trailing_punct1 = ").,;:?!" |
◆ chs_trailing_punct2
char* tesseract::Tesseract::chs_trailing_punct2 = ")'`\"" |
◆ conflict_set_I_l_1
char* tesseract::Tesseract::conflict_set_I_l_1 = "Il1[]" |
◆ crunch_accept_ok
bool tesseract::Tesseract::crunch_accept_ok = true |
◆ crunch_debug
int tesseract::Tesseract::crunch_debug = 0 |
◆ crunch_del_cert
double tesseract::Tesseract::crunch_del_cert = -10.0 |
◆ crunch_del_high_word
double tesseract::Tesseract::crunch_del_high_word = 1.5 |
◆ crunch_del_low_word
double tesseract::Tesseract::crunch_del_low_word = 0.5 |
◆ crunch_del_max_ht
double tesseract::Tesseract::crunch_del_max_ht = 3.0 |
◆ crunch_del_min_ht
double tesseract::Tesseract::crunch_del_min_ht = 0.7 |
◆ crunch_del_min_width
double tesseract::Tesseract::crunch_del_min_width = 3.0 |
◆ crunch_del_rating
double tesseract::Tesseract::crunch_del_rating = 60 |
◆ crunch_early_convert_bad_unlv_chs
bool tesseract::Tesseract::crunch_early_convert_bad_unlv_chs = false |
◆ crunch_early_merge_tess_fails
bool tesseract::Tesseract::crunch_early_merge_tess_fails = true |
◆ crunch_include_numerals
bool tesseract::Tesseract::crunch_include_numerals = false |
◆ crunch_leave_accept_strings
bool tesseract::Tesseract::crunch_leave_accept_strings = false |
◆ crunch_leave_lc_strings
int tesseract::Tesseract::crunch_leave_lc_strings = 4 |
"Don't crunch words with long lower case strings"
Definition at line 960 of file tesseractclass.h.
◆ crunch_leave_ok_strings
bool tesseract::Tesseract::crunch_leave_ok_strings = true |
◆ crunch_leave_uc_strings
int tesseract::Tesseract::crunch_leave_uc_strings = 4 |
"Don't crunch words with long upper case strings"
Definition at line 962 of file tesseractclass.h.
◆ crunch_long_repetitions
int tesseract::Tesseract::crunch_long_repetitions = 3 |
◆ crunch_poor_garbage_cert
double tesseract::Tesseract::crunch_poor_garbage_cert = -9.0 |
◆ crunch_poor_garbage_rate
double tesseract::Tesseract::crunch_poor_garbage_rate = 60 |
◆ crunch_pot_indicators
int tesseract::Tesseract::crunch_pot_indicators = 1 |
◆ crunch_pot_poor_cert
double tesseract::Tesseract::crunch_pot_poor_cert = -8.0 |
◆ crunch_pot_poor_rate
double tesseract::Tesseract::crunch_pot_poor_rate = 40 |
◆ crunch_rating_max
int tesseract::Tesseract::crunch_rating_max = 10 |
◆ crunch_small_outlines_size
double tesseract::Tesseract::crunch_small_outlines_size = 0.6 |
◆ crunch_terrible_garbage
bool tesseract::Tesseract::crunch_terrible_garbage = true |
◆ crunch_terrible_rating
double tesseract::Tesseract::crunch_terrible_rating = 80.0 |
◆ debug_fix_space_level
int tesseract::Tesseract::debug_fix_space_level = 0 |
◆ debug_noise_removal
int tesseract::Tesseract::debug_noise_removal = 0 |
◆ debug_x_ht_level
int tesseract::Tesseract::debug_x_ht_level = 0 |
◆ enable_noise_removal
bool tesseract::Tesseract::enable_noise_removal = true |
"Remove and conditionally reassign small outlines when they confuse layout analysis, determining diacritics vs noise"
Definition at line 858 of file tesseractclass.h.
◆ file_type
char* tesseract::Tesseract::file_type = ".tif" |
◆ fixsp_done_mode
int tesseract::Tesseract::fixsp_done_mode = 1 |
◆ fixsp_non_noise_limit
int tesseract::Tesseract::fixsp_non_noise_limit = 1 |
◆ fixsp_small_outlines_size
double tesseract::Tesseract::fixsp_small_outlines_size = 0.28 |
◆ hocr_char_boxes
bool tesseract::Tesseract::hocr_char_boxes = false |
"Add coordinates for each character to hocr output"
Definition at line 935 of file tesseractclass.h.
◆ hocr_font_info
bool tesseract::Tesseract::hocr_font_info = false |
◆ interactive_display_mode
bool tesseract::Tesseract::interactive_display_mode = false |
◆ jpg_quality
int tesseract::Tesseract::jpg_quality = 85 |
◆ lstm_choice_mode
int tesseract::Tesseract::lstm_choice_mode = 0 |
"Allows including alternative symbol choices in the hOCR output. Valid input values are 0, 1, 2 and 3. 0 is the default value. With 1 the alternative symbol choices per timestep are included. With 2 the alternative symbol choices are accumulated per character."
Definition at line 1087 of file tesseractclass.h.
◆ lstm_use_matrix
bool tesseract::Tesseract::lstm_use_matrix = 1 |
◆ min_characters_to_try
int tesseract::Tesseract::min_characters_to_try = 50 |
◆ min_orientation_margin
double tesseract::Tesseract::min_orientation_margin = 7.0 |
◆ min_sane_x_ht_pixels
int tesseract::Tesseract::min_sane_x_ht_pixels = 8 |
◆ multilang_debug_level
int tesseract::Tesseract::multilang_debug_level = 0 |
◆ noise_cert_basechar
double tesseract::Tesseract::noise_cert_basechar = -8.0 |
◆ noise_cert_disjoint
double tesseract::Tesseract::noise_cert_disjoint = -2.5 |
◆ noise_cert_factor
double tesseract::Tesseract::noise_cert_factor = 0.375 |
◆ noise_cert_punc
double tesseract::Tesseract::noise_cert_punc = -2.5 |
◆ noise_maxperblob
int tesseract::Tesseract::noise_maxperblob = 8 |
◆ noise_maxperword
int tesseract::Tesseract::noise_maxperword = 16 |
◆ numeric_punctuation
char* tesseract::Tesseract::numeric_punctuation = ".," |
◆ ocr_devanagari_split_strategy
"Whether to use the top-line splitting process for Devanagari documents while performing ocr."
Definition at line 819 of file tesseractclass.h.
◆ ok_repeated_ch_non_alphanum_wds
char* tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds = "-?*\075" |
◆ outlines_2
char* tesseract::Tesseract::outlines_2 = "ij!?%\":;" |
◆ outlines_odd
char* tesseract::Tesseract::outlines_odd = "%| " |
◆ page_separator
char* tesseract::Tesseract::page_separator = "\f" |
"Page separator (default is form feed control character)"
Definition at line 1080 of file tesseractclass.h.
◆ pageseg_apply_music_mask
bool tesseract::Tesseract::pageseg_apply_music_mask = true |
"Detect music staff and remove intersecting components"
Definition at line 1089 of file tesseractclass.h.
◆ pageseg_devanagari_split_strategy
"Whether to use the top-line splitting process for Devanagari documents while performing page-segmentation."
Definition at line 815 of file tesseractclass.h.
◆ paragraph_debug_level
int tesseract::Tesseract::paragraph_debug_level = 0 |
◆ paragraph_text_based
bool tesseract::Tesseract::paragraph_text_based = true |
"Run paragraph detection on the post-text-recognition (more accurate)"
Definition at line 896 of file tesseractclass.h.
◆ poly_allow_detailed_fx
bool tesseract::Tesseract::poly_allow_detailed_fx = false |
"Allow feature extractors to see the original outline"
Definition at line 1063 of file tesseractclass.h.
◆ preserve_interword_spaces
bool tesseract::Tesseract::preserve_interword_spaces = false |
◆ quality_blob_pc
double tesseract::Tesseract::quality_blob_pc = 0.0 |
◆ quality_char_pc
double tesseract::Tesseract::quality_char_pc = 0.95 |
◆ quality_min_initial_alphas_reqd
int tesseract::Tesseract::quality_min_initial_alphas_reqd = 2 |
◆ quality_outline_pc
double tesseract::Tesseract::quality_outline_pc = 1.0 |
◆ quality_rej_pc
double tesseract::Tesseract::quality_rej_pc = 0.08 |
◆ quality_rowrej_pc
double tesseract::Tesseract::quality_rowrej_pc = 1.1 |
◆ rej_1Il_trust_permuter_type
bool tesseract::Tesseract::rej_1Il_trust_permuter_type = true |
◆ rej_1Il_use_dict_word
bool tesseract::Tesseract::rej_1Il_use_dict_word = false |
◆ rej_alphas_in_number_perm
bool tesseract::Tesseract::rej_alphas_in_number_perm = false |
◆ rej_trust_doc_dawg
bool tesseract::Tesseract::rej_trust_doc_dawg = false |
◆ rej_use_good_perm
bool tesseract::Tesseract::rej_use_good_perm = true |
◆ rej_use_sensible_wd
bool tesseract::Tesseract::rej_use_sensible_wd = false |
◆ rej_use_tess_accepted
bool tesseract::Tesseract::rej_use_tess_accepted = true |
◆ rej_use_tess_blanks
bool tesseract::Tesseract::rej_use_tess_blanks = true |
◆ rej_whole_of_mostly_reject_word_fract
double tesseract::Tesseract::rej_whole_of_mostly_reject_word_fract = 0.85 |
◆ subscript_max_y_top
double tesseract::Tesseract::subscript_max_y_top = 0.5 |
"Maximum top of a character measured as a multiple of x-height above the baseline for us to reconsider whether it's a subscript."
Definition at line 991 of file tesseractclass.h.
◆ superscript_bettered_certainty
double tesseract::Tesseract::superscript_bettered_certainty = 0.97 |
"What reduction in badness do we think sufficient to choose a superscript over what we'd thought. For example, a value of 0.6 means we want to reduce badness of certainty by 40%"
Definition at line 983 of file tesseractclass.h.
◆ superscript_debug
int tesseract::Tesseract::superscript_debug = 0 |
◆ superscript_min_y_bottom
double tesseract::Tesseract::superscript_min_y_bottom = 0.3 |
"Minimum bottom of a character measured as a multiple of x-height above the baseline for us to reconsider whether it's a superscript."
Definition at line 995 of file tesseractclass.h.
◆ superscript_scaledown_ratio
double tesseract::Tesseract::superscript_scaledown_ratio = 0.4 |
"A superscript scaled down more than this is unbelievably small. For example, 0.3 means we expect the font size to be no smaller than 30% of the text line font size."
Definition at line 987 of file tesseractclass.h.
◆ superscript_worse_certainty
double tesseract::Tesseract::superscript_worse_certainty = 2.0 |
"How many times worse certainty does a superscript position glyph need to be for us to try classifying it as a char with a different baseline?"
Definition at line 978 of file tesseractclass.h.
◆ suspect_accept_rating
double tesseract::Tesseract::suspect_accept_rating = -999.9 |
◆ suspect_constrain_1Il
bool tesseract::Tesseract::suspect_constrain_1Il = false |
◆ suspect_level
int tesseract::Tesseract::suspect_level = 99 |
◆ suspect_rating_per_ch
double tesseract::Tesseract::suspect_rating_per_ch = 999.9 |
◆ suspect_short_words
int tesseract::Tesseract::suspect_short_words = 2 |
◆ tessedit_adaption_debug
bool tesseract::Tesseract::tessedit_adaption_debug = false |
"Generate and print debug information for adaption"
Definition at line 823 of file tesseractclass.h.
◆ tessedit_ambigs_training
bool tesseract::Tesseract::tessedit_ambigs_training = false |
◆ tessedit_bigram_debug
int tesseract::Tesseract::tessedit_bigram_debug = 0 |
"Amount of debug output for bigram correction."
Definition at line 855 of file tesseractclass.h.
◆ tessedit_char_blacklist
char* tesseract::Tesseract::tessedit_char_blacklist = "" |
◆ tessedit_char_unblacklist
char* tesseract::Tesseract::tessedit_char_unblacklist = "" |
"List of chars to override tessedit_char_blacklist"
Definition at line 809 of file tesseractclass.h.
◆ tessedit_char_whitelist
char* tesseract::Tesseract::tessedit_char_whitelist = "" |
◆ tessedit_create_alto
bool tesseract::Tesseract::tessedit_create_alto = false |
◆ tessedit_create_boxfile
bool tesseract::Tesseract::tessedit_create_boxfile = false |
◆ tessedit_create_hocr
bool tesseract::Tesseract::tessedit_create_hocr = false |
◆ tessedit_create_lstmbox
bool tesseract::Tesseract::tessedit_create_lstmbox = false |
◆ tessedit_create_pdf
bool tesseract::Tesseract::tessedit_create_pdf = false |
◆ tessedit_create_tsv
bool tesseract::Tesseract::tessedit_create_tsv = false |
◆ tessedit_create_txt
bool tesseract::Tesseract::tessedit_create_txt = false |
◆ tessedit_create_wordstrbox
bool tesseract::Tesseract::tessedit_create_wordstrbox = false |
◆ tessedit_debug_block_rejection
bool tesseract::Tesseract::tessedit_debug_block_rejection = false |
◆ tessedit_debug_doc_rejection
bool tesseract::Tesseract::tessedit_debug_doc_rejection = false |
◆ tessedit_debug_fonts
bool tesseract::Tesseract::tessedit_debug_fonts = false |
◆ tessedit_debug_quality_metrics
bool tesseract::Tesseract::tessedit_debug_quality_metrics = false |
◆ tessedit_display_outwords
bool tesseract::Tesseract::tessedit_display_outwords = false |
◆ tessedit_do_invert
bool tesseract::Tesseract::tessedit_do_invert = true |
"Try inverting the image in `LSTMRecognizeWord`"
Definition at line 797 of file tesseractclass.h.
◆ tessedit_dont_blkrej_good_wds
bool tesseract::Tesseract::tessedit_dont_blkrej_good_wds = false |
◆ tessedit_dont_rowrej_good_wds
bool tesseract::Tesseract::tessedit_dont_rowrej_good_wds = false |
◆ tessedit_dump_choices
bool tesseract::Tesseract::tessedit_dump_choices = false |
◆ tessedit_dump_pageseg_images
bool tesseract::Tesseract::tessedit_dump_pageseg_images = false |
"Dump intermediate images made during page segmentation"
Definition at line 795 of file tesseractclass.h.
◆ tessedit_enable_bigram_correction
bool tesseract::Tesseract::tessedit_enable_bigram_correction = true |
"Enable correction based on the word bigram dictionary."
Definition at line 850 of file tesseractclass.h.
◆ tessedit_enable_dict_correction
bool tesseract::Tesseract::tessedit_enable_dict_correction = false |
"Enable single word correction based on the dictionary."
Definition at line 852 of file tesseractclass.h.
◆ tessedit_enable_doc_dict
bool tesseract::Tesseract::tessedit_enable_doc_dict = true |
◆ tessedit_fix_fuzzy_spaces
bool tesseract::Tesseract::tessedit_fix_fuzzy_spaces = true |
◆ tessedit_fix_hyphens
bool tesseract::Tesseract::tessedit_fix_hyphens = true |
◆ tessedit_flip_0O
bool tesseract::Tesseract::tessedit_flip_0O = true |
◆ tessedit_good_doc_still_rowrej_wd
double tesseract::Tesseract::tessedit_good_doc_still_rowrej_wd = 1.1 |
"Reject good doc word if more than this fraction rejected"
Definition at line 925 of file tesseractclass.h.
◆ tessedit_good_quality_unrej
bool tesseract::Tesseract::tessedit_good_quality_unrej = true |
◆ tessedit_image_border
int tesseract::Tesseract::tessedit_image_border = 2 |
◆ tessedit_init_config_only
bool tesseract::Tesseract::tessedit_init_config_only = false |
"Only initialize with the config file. Useful if the instance is not going to be used for OCR but say only for layout analysis."
Definition at line 1066 of file tesseractclass.h.
◆ tessedit_load_sublangs
char* tesseract::Tesseract::tessedit_load_sublangs = "" |
◆ tessedit_lower_flip_hyphen
double tesseract::Tesseract::tessedit_lower_flip_hyphen = 1.5 |
◆ tessedit_make_boxes_from_boxes
bool tesseract::Tesseract::tessedit_make_boxes_from_boxes = false |
◆ tessedit_minimal_rej_pass1
bool tesseract::Tesseract::tessedit_minimal_rej_pass1 = false |
◆ tessedit_minimal_rejection
bool tesseract::Tesseract::tessedit_minimal_rejection = false |
◆ tessedit_ocr_engine_mode
"Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults to loading and running the most accurate available."
Definition at line 804 of file tesseractclass.h.
◆ tessedit_override_permuter
bool tesseract::Tesseract::tessedit_override_permuter = true |
◆ tessedit_page_number
int tesseract::Tesseract::tessedit_page_number = -1 |
◆ tessedit_pageseg_mode
"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block, 5=line, 6=word, 7=char (Values from PageSegMode enum in publictypes.h)"
Definition at line 801 of file tesseractclass.h.
◆ tessedit_parallelize
int tesseract::Tesseract::tessedit_parallelize = 0 |
◆ tessedit_prefer_joined_punct
bool tesseract::Tesseract::tessedit_prefer_joined_punct = false |
◆ tessedit_preserve_blk_rej_perfect_wds
bool tesseract::Tesseract::tessedit_preserve_blk_rej_perfect_wds = true |
"Only reject partially rejected words in block rejection"
Definition at line 913 of file tesseractclass.h.
◆ tessedit_preserve_min_wd_len
int tesseract::Tesseract::tessedit_preserve_min_wd_len = 2 |
◆ tessedit_preserve_row_rej_perfect_wds
bool tesseract::Tesseract::tessedit_preserve_row_rej_perfect_wds = true |
"Only reject partially rejected words in row rejection"
Definition at line 915 of file tesseractclass.h.
◆ tessedit_reject_bad_qual_wds
bool tesseract::Tesseract::tessedit_reject_bad_qual_wds = true |
◆ tessedit_reject_block_percent
double tesseract::Tesseract::tessedit_reject_block_percent = 45.00 |
◆ tessedit_reject_doc_percent
double tesseract::Tesseract::tessedit_reject_doc_percent = 65.00 |
◆ tessedit_reject_mode
int tesseract::Tesseract::tessedit_reject_mode = 0 |
◆ tessedit_reject_row_percent
double tesseract::Tesseract::tessedit_reject_row_percent = 40.00 |
◆ tessedit_rejection_debug
bool tesseract::Tesseract::tessedit_rejection_debug = false |
◆ tessedit_resegment_from_boxes
bool tesseract::Tesseract::tessedit_resegment_from_boxes = false |
"Take segmentation and labeling from box file"
Definition at line 785 of file tesseractclass.h.
◆ tessedit_resegment_from_line_boxes
bool tesseract::Tesseract::tessedit_resegment_from_line_boxes = false |
"Conversion of word/line box file to char box file"
Definition at line 787 of file tesseractclass.h.
◆ tessedit_row_rej_good_docs
bool tesseract::Tesseract::tessedit_row_rej_good_docs = true |
◆ tessedit_tess_adaption_mode
int tesseract::Tesseract::tessedit_tess_adaption_mode = 0x27 |
◆ tessedit_test_adaption
bool tesseract::Tesseract::tessedit_test_adaption = false |
◆ tessedit_timing_debug
bool tesseract::Tesseract::tessedit_timing_debug = false |
◆ tessedit_train_from_boxes
bool tesseract::Tesseract::tessedit_train_from_boxes = false |
◆ tessedit_train_line_recognizer
bool tesseract::Tesseract::tessedit_train_line_recognizer = false |
"Break input into lines and remap boxes if present"
Definition at line 793 of file tesseractclass.h.
◆ tessedit_unrej_any_wd
bool tesseract::Tesseract::tessedit_unrej_any_wd = false |
◆ tessedit_upper_flip_hyphen
double tesseract::Tesseract::tessedit_upper_flip_hyphen = 1.8 |
◆ tessedit_use_primary_params_model
bool tesseract::Tesseract::tessedit_use_primary_params_model = false |
"In multilingual mode use params model of the primary language"
Definition at line 1055 of file tesseractclass.h.
◆ tessedit_use_reject_spaces
bool tesseract::Tesseract::tessedit_use_reject_spaces = true |
◆ tessedit_whole_wd_rej_row_percent
double tesseract::Tesseract::tessedit_whole_wd_rej_row_percent = 70.00 |
"Number of row rejects in whole word rejects which prevents whole row rejection"
Definition at line 911 of file tesseractclass.h.
◆ tessedit_word_for_word
bool tesseract::Tesseract::tessedit_word_for_word = false |
◆ tessedit_write_block_separators
bool tesseract::Tesseract::tessedit_write_block_separators = false |
◆ tessedit_write_images
bool tesseract::Tesseract::tessedit_write_images = false |
◆ tessedit_write_params_to_file
char* tesseract::Tesseract::tessedit_write_params_to_file = "" |
◆ tessedit_write_rep_codes
bool tesseract::Tesseract::tessedit_write_rep_codes = false |
◆ tessedit_write_unlv
bool tesseract::Tesseract::tessedit_write_unlv = false |
◆ tessedit_zero_kelvin_rejection
bool tesseract::Tesseract::tessedit_zero_kelvin_rejection = false |
◆ tessedit_zero_rejection
bool tesseract::Tesseract::tessedit_zero_rejection = false |
◆ test_pt
bool tesseract::Tesseract::test_pt = false |
◆ test_pt_x
double tesseract::Tesseract::test_pt_x = 99999.99 |
◆ test_pt_y
double tesseract::Tesseract::test_pt_y = 99999.99 |
◆ textonly_pdf
bool tesseract::Tesseract::textonly_pdf = false |
◆ textord_equation_detect
bool tesseract::Tesseract::textord_equation_detect = false |
◆ textord_tabfind_aligned_gap_fraction
double tesseract::Tesseract::textord_tabfind_aligned_gap_fraction = 0.75 |
"Fraction of height used as a minimum gap for aligned blobs."
Definition at line 1075 of file tesseractclass.h.
◆ textord_tabfind_force_vertical_text
bool tesseract::Tesseract::textord_tabfind_force_vertical_text = false |
◆ textord_tabfind_show_vlines
bool tesseract::Tesseract::textord_tabfind_show_vlines = false |
◆ textord_tabfind_vertical_text
bool tesseract::Tesseract::textord_tabfind_vertical_text = true |
◆ textord_tabfind_vertical_text_ratio
double tesseract::Tesseract::textord_tabfind_vertical_text_ratio = 0.5 |
"Fraction of textlines deemed vertical to use vertical page mode"
Definition at line 1073 of file tesseractclass.h.
◆ textord_use_cjk_fp_model
bool tesseract::Tesseract::textord_use_cjk_fp_model = false |
◆ unlv_tilde_crunching
bool tesseract::Tesseract::unlv_tilde_crunching = false |
◆ unrecognised_char
char* tesseract::Tesseract::unrecognised_char = "|" |
◆ user_defined_dpi
int tesseract::Tesseract::user_defined_dpi = 0 |
◆ x_ht_acceptance_tolerance
int tesseract::Tesseract::x_ht_acceptance_tolerance = 8 |
"Max allowed deviation of blob top outside of font data"
Definition at line 972 of file tesseractclass.h.
◆ x_ht_min_change
int tesseract::Tesseract::x_ht_min_change = 8 |
The documentation for this class was generated from the following files:
int IntCastRounded(double x)
bool dangerous_ambig_found() const
bool one_ell_conflict(WERD_RES *word_res, bool update_map)
void init_to_size(int size, const T &t)
uint32_t unsigned_size() const
bool tilde_crunch_written
int os_detect_blobs(const GenericVector< int > *allowed_scripts, BLOBNBOX_CLIST *blob_list, OSResults *osr, tesseract::Tesseract *tess)
bool tessedit_enable_doc_dict
TBOX bounding_box() const
bool Split(bool split_for_pageseg, DebugPixa *pixa_debug)
bool AdaptiveClassifierIsFull() const
bool last_char_was_newline
double crunch_pot_poor_cert
int tessedit_preserve_min_wd_len
WERD_CHOICE * prev_word_best_choice_
int get_script_table_size() const
void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height, Pix *binary_pix, Pix *thresholds_pix, Pix *grey_pix, bool use_box_bottoms, BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
int crunch_leave_lc_strings
void set_unichar_id(UNICHAR_ID unichar_id, int index)
void script_pos_pass(PAGE_RES *page_res)
void RecognizeLine(const ImageData &image_data, bool invert, bool debug, double worst_dict_cert, const TBOX &line_box, PointerVector< WERD_RES > *words, int lstm_choice_mode=0)
char * chs_trailing_punct1
#define STRING_MEMBER(name, val, comment, vec)
@ AC_LOWER_CASE
ALL lower case.
GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word)
bool tessedit_debug_doc_rejection
bool AcceptableResult(WERD_RES *word) const
double rej_whole_of_mostly_reject_word_fract
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
@ AC_UPPER_CASE
ALL upper case.
void write_results(PAGE_RES_IT &page_res_it, char newline_type, bool force_eol)
void SetLangTesseract(Tesseract *lang_tesseract)
UnicityTable< FontInfo > & get_fontinfo_table()
int crunch_leave_uc_strings
void ResetAdaptiveClassifier()
double superscript_min_y_bottom
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
int textord_debug_tabfind
int state(int index) const
const char * string() const
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
void(Tesseract::*)(const WordData &, WERD_RES **, PointerVector< WERD_RES > *) WordRecognizer
@ OEM_TESSERACT_LSTM_COMBINED
bool tessedit_ambigs_training
const float kCertaintyScale
void TidyUp(PAGE_RES *page_res)
int16_t eval_word_spacing(WERD_RES_LIST &word_res_list)
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
tesseract::Tesseract * tesseract
int multilang_debug_level
bool deadline_exceeded() const
bool x_overlap(const TBOX &box) const
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
TBOX bounding_box() const
const STRING debug_string() const
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
void SetVisible(bool visible)
void read_config_file(const char *filename, SetParamConstraint constraint)
int16_t first_alphanum_index(const char *word, const char *word_lengths)
double tessedit_reject_doc_percent
bool SubAndSuperscriptFix(WERD_RES *word_res)
bool write_results_empty_block
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
int16_t doc_good_char_quality
WERD_CHOICE * best_choice
@ PSM_RAW_LINE
hacks that are Tesseract-specific.
int16_t fontinfo_id2() const
bool check_debug_pt(WERD_RES *word, int location)
bool textord_use_cjk_fp_model
double classify_max_rating_ratio
void InsertSeam(int blob_number, SEAM *seam)
bool word_bln_display(PAGE_RES_IT *pr_it)
STRING language_data_path_prefix
void pad(int xpad, int ypad)
TBOX intersection(const TBOX &box) const
BlamerBundle * blamer_bundle
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
bool tessedit_dont_rowrej_good_wds
bool flag(WERD_FLAGS mask) const
@ W_DONT_CHOP
fixed pitch chopped
std::unique_ptr< LanguageModel > language_model_
void PrintBestChoices() const
bool HasDifferentSplitStrategies() const
const STRING & unichar_string() const
@ SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY
bool tessedit_create_wordstrbox
char * tessedit_char_whitelist
void add_str_int(const char *str, int number)
char * ShowInputDialog(const char *msg)
const UNICHARSET * uch_set
SEAM * chop_one_blob(const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE * > &blob_choices, WERD_RES *word_res, int *blob_number)
bool crunch_leave_ok_strings
bool fixspace_thinks_word_done(WERD_RES *word)
double crunch_poor_garbage_rate
bool right_to_left() const
void Load(const STRING &lang, TessdataManager *data_file)
bool tessedit_dump_pageseg_images
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
int paragraph_debug_level
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
bool tessedit_train_from_boxes
void SetupWordPassN(int pass_n, WordData *word)
bool noise_outlines(TWERD *word)
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
int pageseg_devanagari_split_strategy
int16_t failure_count(WERD_RES *word)
constexpr int kMaxCredibleResolution
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB * > *target_blobs)
int classify_integer_matcher_multiplier
char * tessedit_char_blacklist
int get_id(T object) const
bool unlv_tilde_crunching
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
const double kMinRefitXHeightFraction
@ PSM_SINGLE_BLOCK
Assume a single uniform block of text. (Default.)
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
GenericVector< STRING > misadaption_log
const char * id_to_unichar(UNICHAR_ID id) const
bool tessedit_row_rej_good_docs
bool applybox_learn_chars_and_char_frags_mode
bool major_x_overlap(const TBOX &box) const
double crunch_del_low_word
#define LOC_WRITE_RESULTS
C_BLOB_LIST * cblob_list()
bool tessedit_enable_dict_correction
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
bool tessedit_fix_fuzzy_spaces
double textord_tabfind_aligned_gap_fraction
bool tessedit_display_outwords
bool get_isalpha(UNICHAR_ID unichar_id) const
bool textord_tabfind_vertical_text
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
#define INT_INIT_MEMBER(name, val, comment, vec)
bool PSM_COL_FIND_ENABLED(int pageseg_mode)
bool tessedit_debug_quality_metrics
bool word_display(PAGE_RES_IT *pr_it)
bool word_set_display(PAGE_RES_IT *pr_it)
bool tessedit_train_line_recognizer
SVEvent * AwaitEvent(SVEventType type)
double tessedit_reject_row_percent
int debug_fix_space_level
@ SHOW_SUPERSCRIPT_CMD_EVENT
static const double kXHeightCapRatio
void set_pageseg_split_strategy(SplitStrategy strategy)
bool assume_fixed_pitch_char_segment
void set_segmentation_block_list(BLOCK_LIST *block_list)
void set_flag(WERD_FLAGS mask, bool value)
GenericVector< TBLOB * > blobs
@ SET_PARAM_CONSTRAINT_DEBUG_ONLY
void set_global_loc_code(int loc_code)
#define INT_MEMBER(name, val, comment, vec)
bool PSM_SPARSE(int pageseg_mode)
@ AC_UNACCEPTABLE
Unacceptable word.
void AddMessage(const char *format,...)
double superscript_worse_certainty
bool get_isdigit(UNICHAR_ID unichar_id) const
void SetScriptPositions()
void ConsumeWordResults(WERD_RES *word)
void set_certainty(float new_val)
bool get_islower(UNICHAR_ID unichar_id) const
STRING debug_str(UNICHAR_ID id) const
@ W_REP_CHAR
repeated character
@ PSM_SINGLE_WORD
Treat the image as a single word.
bool wordrec_debug_blamer
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
int ocr_devanagari_split_strategy
int init_tesseract_internal(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
BLOB_CHOICE * GetBlobChoice(int index) const
WERD_RES * next_word() const
char * numeric_punctuation
STRING TruthString() const
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
UNICHAR_ID unichar_id(int index) const
char * tessedit_load_sublangs
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
void recog_word_recursive(WERD_RES *word)
void tess_segment_pass_n(int pass_n, WERD_RES *word)
const char * get_script_from_script_id(int id) const
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
int LabelSpecialText(TO_BLOCK *to_block) override
DLLSYM void tprintf(const char *format,...)
const T & get(int id) const
Return the object from an id.
void set_ocr_split_strategy(SplitStrategy strategy)
bool tessedit_prefer_joined_punct
bool crunch_terrible_garbage
@ PSM_SINGLE_BLOCK_VERT_TEXT
aligned text.
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
double tessedit_lower_flip_hyphen
void turn_on_bit(uint8_t bit_num)
#define BOOL_INIT_MEMBER(name, val, comment, vec)
void DebugWordChoices(bool debug, const char *word_to_debug)
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
bool poly_allow_detailed_fx
bool tessedit_create_lstmbox
void reject_I_1_L(WERD_RES *word)
bool interactive_display_mode
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
int16_t safe_dict_word(const WERD_RES *werd_res)
bool tessedit_zero_rejection
void Text(int x, int y, const char *mystring)
void insert(const T &t, int index)
void ZoomToRectangle(int x1, int y1, int x2, int y2)
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
@ SHOW_UNDERLINE_CMD_EVENT
bool tessedit_zero_kelvin_rejection
bool tessedit_write_images
void BuildMenu(ScrollView *sv, bool menu_bar=true)
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
bool tessedit_rejection_debug
BLOCK_RES * block() const
void BestChoiceToCorrectText()
ROW_LIST * row_list()
get rows
bool tess_acceptable_word(WERD_RES *word)
const int16_t kMaxBoxEdgeDiff
static int SortByXMiddle(const void *v1, const void *v2)
WERD_CHOICE_LIST best_choices
void TrainFromBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
void * cancel_this
monitor-aware progress callback
GenericVector< int > blame_reasons
const GenericVector< tesseract::ScoredFont > & fonts() const
double noise_cert_basechar
double superscript_bettered_certainty
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX *next_box, const char *correct_text)
int DivRounded(int a, int b)
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
const double kMaxXHeightDeviationFraction
void set_rating(float newrat)
bool digit_or_numeric_punct(WERD_RES *word, int char_position)
const STRING & unichar_lengths() const
bool tessedit_write_block_separators
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
bool AdaptableWord(WERD_RES *word)
bool tessedit_good_quality_unrej
double tessedit_upper_flip_hyphen
double subscript_max_y_top
bool acceptable_number_string(const char *s, const char *lengths)
static WERD_RES * deep_copy(const WERD_RES *src)
void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box)
char * ok_repeated_ch_non_alphanum_wds
int8_t fontinfo_id2_count
bool recog_interactive(PAGE_RES_IT *pr_it)
bool suspect_constrain_1Il
void PreenXHeights(BLOCK_LIST *block_list)
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
int16_t word_outline_errs(WERD_RES *word)
@ SHOW_BLN_WERD_CMD_EVENT
UnicharAmbigs unichar_ambigs
PDBLK pdblk
Page Description Block.
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
void SetupUniversalFontIds()
bool tessedit_debug_block_rejection
bool contains(char c) const
void ReSegmentByClassification(PAGE_RES *page_res)
bool crunch_leave_accept_strings
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
int classify_class_pruner_multiplier
bool stopper_no_acceptable_choices
void do_re_display(bool(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
#define BOOL_MEMBER(name, val, comment, vec)
void tilde_delete(PAGE_RES_IT &page_res_it)
void SwitchAdaptiveClassifier()
static void LastChanceBlame(bool debug, WERD_RES *word)
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE * > &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
bool crunch_early_merge_tess_fails
static C_BLOB * deep_copy(const C_BLOB *src)
SVMenuNode * build_menu_new()
void SetupFake(const UNICHARSET &uch)
ImageData * GetLineData(const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
void MakeCurrentWordFuzzy()
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
int tessedit_bigram_debug
void Rectangle(int x1, int y1, int x2, int y2)
POLY_BLOCK * poly_block() const
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, bool(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
bool use_ambigs_for_adaption
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
bool paragraph_text_based
double suspect_accept_rating
bool wordrec_enable_assoc
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE * > &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
int tessedit_pageseg_mode
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
@ AC_INITIAL_CAP
ALL but initial lc.
void InitForRetryRecognition(const WERD_RES &source)
void set_orig_pix(Pix *pix)
volatile int8_t ocr_alive
true if not last
bool tessedit_override_permuter
void set_unlv_suspects(WERD_RES *word)
int crunch_long_repetitions
void set_y(int16_t yin)
rewrite function
int fixsp_non_noise_limit
int push_back(T object)
Add an element in the table.
void move(const ICOORD vec)
void ReplaceBestChoice(WERD_CHOICE *choice)
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
char * chs_trailing_punct2
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
bool tessedit_resegment_from_boxes
@ DF_POLYGONAL
Polyg approx.
void SearchWords(PointerVector< WERD_RES > *words)
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
double superscript_scaledown_ratio
bool tessedit_debug_fonts
void AcceptIfGoodQuality(int index)
static Pix * FindImages(Pix *pix, DebugPixa *pixa_debug)
bool tessedit_resegment_from_line_boxes
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
void ResetDocumentDictionary()
bool rej_alphas_in_number_perm
bool classify_bln_numeric_mode
const ICOORD & botleft() const
#define double_MEMBER(name, val, comment, vec)
bool preserve_interword_spaces
TBOX bounding_box() const
int tessedit_image_border
void StartBackupAdaptiveClassifier()
@ SHOW_FIXEDPITCH_CMD_EVENT
void bigram_correction_pass(PAGE_RES *page_res)
const TBOX & BlobBox(int index) const
bool Load(const ParamsVectors *params, const char *lang, TessdataManager *mgr)
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
bool word_dumper(PAGE_RES_IT *pr_it)
void set_word_fonts(WERD_RES *word)
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE * > &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB * > *target_blobs)
int16_t count_alphanums(const WERD_CHOICE &word)
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
bool AddSelectedOutlines(const GenericVector< bool > &wanted, const GenericVector< C_BLOB * > &target_blobs, const GenericVector< C_OUTLINE * > &outlines, bool *make_next_word_fuzzy)
void flip_0O(WERD_RES *word)
@ W_FUZZY_NON
Fuzzy non-space.
float max_x_height() const
int dict_word(const WERD_CHOICE &word)
WERD_RES * restart_page()
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX *next_box, const char *correct_text)
bool tessedit_use_primary_params_model
bool rej_1Il_trust_permuter_type
void set_permuter(uint8_t perm)
bool tessedit_write_rep_codes
bool tessedit_create_boxfile
@ DF_EDGE_STEP
Edge steps.
bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode)
const float kWorstDictCertainty
constexpr int kMinCredibleResolution
SVMenuNode * AddChild(const char *txt)
void recog_word(WERD_RES *word)
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
bool tessedit_preserve_blk_rej_perfect_wds
bool tessedit_init_config_only
void blamer_pass(PAGE_RES *page_res)
void AddPix(const Pix *pix, const char *caption)
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
bool tessedit_minimal_rej_pass1
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
void initialise(int16_t length)
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box)
bool tessedit_minimal_rejection
void GetNoiseOutlines(GenericVector< C_OUTLINE * > *outlines)
bool word_blank_and_set_display(PAGE_RES_IT *pr_its)
void SetupForLoad(DawgCache *dawg_cache)
const int kBlnBaselineOffset
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
void fix_rep_char(PAGE_RES_IT *page_res_it)
bool tessedit_test_adaption
void rotate(const FCOORD &vec)
@ SHOW_SMALLCAPS_CMD_EVENT
void ResetAdaptiveClassifierInternal()
bool word_contains_non_1_digit(const char *word, const char *word_lengths)
double tessedit_reject_block_percent
void find_components(Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
int tessedit_tess_adaption_mode
double quality_outline_pc
float min_x_height() const
int x_ht_acceptance_tolerance
bool PSM_ORIENTATION_ENABLED(int pageseg_mode)
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
void transform_to_next_perm(WERD_RES_LIST &words)
tesseract::BoxWord * bln_boxes
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
bool tessedit_timing_debug
int16_t alpha_count(const char *word, const char *word_lengths)
bool tessedit_enable_bigram_correction
const UNICHARSET * unicharset() const
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
bool major_overlap(const TBOX &box) const
void set_certainty(float newrat)
int16_t worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
void Image(struct Pix *image, int x_pos, int y_pos)
void make_bad()
Set the fields in this choice to be default (bad) values.
void full_print(FILE *fp)
void rej_word_bad_quality()
void SetAllScriptPositions(tesseract::ScriptPos position)
const ICOORD & topright() const
char determine_newline_type(WERD *word, BLOCK *block, WERD *next_word, BLOCK *next_block)
C_BLOB_LIST * rej_cblob_list()
static ScrollView::Color NextColor(ScrollView::Color colour)
char * applybox_exposure_pattern
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
double tessedit_good_doc_still_rowrej_wd
void set_blanks(uint8_t new_blanks)
int quality_min_initial_alphas_reqd
void set_x(int16_t xin)
Rewrite function: sets the x coordinate to xin.
@ SHOW_DROPCAPS_CMD_EVENT
bool script_has_xheight() const
int tessedit_ocr_engine_mode
bool contains(const FCOORD pt) const
bool crunch_early_convert_bad_unlv_chs
bool rej_1Il_use_dict_word
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
double fixsp_small_outlines_size
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
PROGRESS_FUNC2 progress_callback2
Monitor-aware progress callback.
void TextAttributes(const char *font, int pixel_size, bool bold, bool italic, bool underlined)
@ PSM_CIRCLE_WORD
Treat the image as a single word in a circle.
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
float base_line(float xpos) const
WERD_CHOICE shallow_copy(int start, int end) const
int editor_image_word_bb_color
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
IncorrectResultReason incorrect_result_reason() const
double crunch_poor_garbage_cert
bool tessedit_reject_bad_qual_wds
double noise_cert_disjoint
@ SHOW_SUBSCRIPT_CMD_EVENT
bool tessedit_create_alto
bool AdaptiveClassifierIsEmpty() const
@ PSM_OSD_ONLY
Orientation and script detection only.
tesseract::BoxWord * box_word
bool word_adaptable(WERD_RES *word, uint16_t mode)
void convert_bad_unlv_chs(WERD_RES *word_res)
float blob_noise_score(TBLOB *blob)
bool tessedit_adaption_debug
void reject_mostly_rejects(WERD_RES *word)
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
void fix_fuzzy_spaces(ETEXT_DESC *monitor, int32_t word_count, PAGE_RES *page_res)
void set_global_subloc_code(int loc_code)
void make_reject_map(WERD_RES *word, ROW *row, int16_t pass)
TBOX bounding_box() const
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
double min_orientation_margin
bool load_from_file(const char *const filename, bool skip_fragments)
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
double crunch_del_high_word
bool tessedit_preserve_row_rej_perfect_wds
void AddEventHandler(SVEventHandler *listener)
Add an Event Listener to this ScrollView Window.
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
bool major_right_to_left() const
int size() const
Return the size used.
bool crunch_include_numerals
const char *const kBackUpConfigFile
int16_t fontinfo_id() const
char * tessedit_write_params_to_file
@ SET_PARAM_CONSTRAINT_NONE
bool textord_equation_detect
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
@ DF_BN_POLYGONAL
Baseline-normalised polygonal approximation.
bool non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
bool tessedit_fix_hyphens
void CopyFrom(const UNICHARSET &src)
bool tessedit_word_for_word
const UNICHARSET & GetUnicharset() const
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
static TESS_API DawgCache * GlobalDawgCache()
void dump_words(WERD_RES_LIST &perm, int16_t score, int16_t mode, bool improved)
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
bool top_bottom_useful() const
void LearnWord(const char *fontname, WERD_RES *word)
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
FCOORD classify_rotation() const
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
const Dict * GetDict() const
void WritePDF(const char *filename)
bool tessedit_dump_choices
bool get_ispunctuation(UNICHAR_ID unichar_id) const
void tess_add_doc_word(WERD_CHOICE *word_choice)
BLOCK_RES * next_block() const
int16_t count_alphas(const WERD_CHOICE &word)
void CloneChoppedToRebuild()
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
double crunch_terrible_rating
int editor_image_blob_bb_color
char * tessedit_char_unblacklist
double crunch_pot_poor_rate
void extract_edges(Pix *pix, BLOCK *block)
static const char * IncorrectReasonName(IncorrectResultReason irr)
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
void PrerecAllWordsPar(const GenericVector< WordData > &words)
void set_x_height(float new_xheight)
void plot(ScrollView *window)
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
void turn_off_bit(uint8_t bit_num)
double textord_tabfind_vertical_text_ratio
UnicityTable< FontInfo > fontinfo_table_
int16_t count_outline_errs(char c, int16_t outline_count)
bool tessedit_create_hocr
double tessedit_whole_wd_rej_row_percent
int CountMisfitTops(WERD_RES *word_res)
FCOORD re_rotation() const
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
const FontInfo * fontinfo2
int16_t first_alphanum_offset(const char *word, const char *word_lengths)
bool tessedit_dont_blkrej_good_wds
BLOCK_RES_LIST block_res_list
int min_characters_to_try
bool get_isupper(UNICHAR_ID unichar_id) const
const FontInfo * fontinfo
void break_noisiest_blob_word(WERD_RES_LIST &words)
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
double suspect_rating_per_ch
bool textord_tabfind_show_vlines
void rej_word_tess_failure()
bool ContainsAnyNonSpaceDelimited() const
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
int16_t fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
int get_script_id_from_name(const char *script_name) const
bool PSM_OSD_ENABLED(int pageseg_mode)
double crunch_small_outlines_size
constexpr int kResolutionEstimationFactor
@ DF_BLAMER
Blamer information.
int crunch_pot_indicators
CRUNCH_MODE unlv_crunch_mode
bool pageseg_apply_music_mask
double crunch_del_min_width
bool bit(uint8_t bit_num) const
bool tessedit_unrej_any_wd
Dict & getDict() override
void dictionary_correction_pass(PAGE_RES *page_res)
void print(FILE *fp, bool dump)
dump whole table
CANCEL_FUNC cancel
Returns true to cancel.
C_OUTLINE_LIST * out_list()
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
const char * ScriptPosToString(enum ScriptPos script_pos)
GenericVector< int > best_state
bool applybox_learn_ngrams_mode
const char * c_str() const
void MergeAdjacentBlobs(int index)
bool rej_use_tess_accepted
void font_recognition_pass(PAGE_RES *page_res)
void GetNonSuperscriptSpan(int *start, int *end) const
int16_t progress
Percent complete, increasing (0-100).
void unrej_good_chs(WERD_RES *word, ROW *row)
bool tessedit_use_reject_spaces
bool enable_noise_removal
bool tessedit_make_boxes_from_boxes
bool textord_tabfind_force_vertical_text
void set_display_flag(uint8_t flag, bool value)
double classify_max_certainty_margin
char * conflict_set_I_l_1
void set_use_cjk_fp_model(bool flag)
void tilde_crunch(PAGE_RES_IT &page_res_it)
bool non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
int16_t word_blob_quality(WERD_RES *word, ROW *row)