tesseract  4.1.1
context.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: context.cpp (Formerly context.c)
5  * Description: Context checking functions
6  * Author: Mark Seaman, OCR Technology
7  *
8  * (c) Copyright 1990, Hewlett-Packard Company.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  *********************************************************************************/
20 
21 #include "dict.h"
22 #include "unicharset.h"
23 
24 namespace tesseract {
25 
namespace {

// Thresholds used by Dict::absolute_garbage().

// Words with fewer unichars than this are never reported as absolute garbage.
const int kMinAbsoluteGarbageWordLength = 10;

// A sufficiently long word is reported as absolute garbage when the fraction
// of its unichars that are alphabetic or digits is strictly below this value.
const float kMinAbsoluteGarbageAlphanumFrac = 0.5f;

}  // namespace
28 
// State-transition table driving Dict::case_ok().
//
// Rows are the current state; columns are the class of the next unichar:
//   column 0 (P): anything that is not upper case, lower case or a digit
//                 (presumably "punctuation")
//   column 1 (U): upper case
//   column 2 (L): lower case
//   column 3 (D): digit
//
// Each entry is the next state, or -1 to signal a case error.
const int case_state_table[6][4] = {
    /*                                   P   U   L   D */
    /* 0. Beginning of word        */ {0, 1, 5, 4},
    /* 1. After initial capital    */ {0, 3, 2, 4},
    /* 2. After lower case         */ {0, -1, 2, -1},
    /* 3. After upper case         */ {0, 3, -1, 4},
    /* 4. After a digit            */ {0, -1, -1, 4},
    /* 5. After initial lower case */ {5, -1, 2, -1},
};
45 
46 int Dict::case_ok(const WERD_CHOICE &word) const {
47  int state = 0;
48  int x;
49  const UNICHARSET* unicharset = word.unicharset();
50  for (x = 0; x < word.length(); ++x) {
51  UNICHAR_ID ch_id = word.unichar_id(x);
52  if (unicharset->get_isupper(ch_id))
53  state = case_state_table[state][1];
54  else if (unicharset->get_islower(ch_id))
55  state = case_state_table[state][2];
56  else if (unicharset->get_isdigit(ch_id))
57  state = case_state_table[state][3];
58  else
59  state = case_state_table[state][0];
60  if (state == -1) return false;
61  }
62  return state != 5; // single lower is bad
63 }
64 
66  const UNICHARSET &unicharset) {
67  if (word.length() < kMinAbsoluteGarbageWordLength) return false;
68  int num_alphanum = 0;
69  for (int x = 0; x < word.length(); ++x) {
70  num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) ||
71  unicharset.get_isdigit(word.unichar_id(x)));
72  }
73  return (static_cast<float>(num_alphanum) /
74  static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);
75 }
76 
77 } // namespace tesseract
tesseract::case_state_table
const int case_state_table[6][4]
Definition: context.cpp:29
UNICHARSET::get_isalpha
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
tesseract
Definition: altorenderer.cpp:25
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:498
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:34
tesseract::Dict::case_ok
int case_ok(const WERD_CHOICE &word) const
Check a string to see if it matches a set of lexical rules.
Definition: context.cpp:46
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
tesseract::Dict::absolute_garbage
bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset)
Definition: context.cpp:65
unicharset.h
WERD_CHOICE::unicharset
const UNICHARSET * unicharset() const
Definition: ratngs.h:290
dict.h
WERD_CHOICE::length
int length() const
Definition: ratngs.h:293
UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
UNICHARSET
Definition: unicharset.h:145
WERD_CHOICE
Definition: ratngs.h:263