tesseract  4.1.1
WERD_CHOICE Class Reference

#include <ratngs.h>

Inheritance diagram for WERD_CHOICE:
ELIST_LINK

Public Member Functions

 WERD_CHOICE (const UNICHARSET *unicharset)
 
 WERD_CHOICE (const UNICHARSET *unicharset, int reserved)
 
 WERD_CHOICE (const char *src_string, const char *src_lengths, float src_rating, float src_certainty, uint8_t src_permuter, const UNICHARSET &unicharset)
 
 WERD_CHOICE (const char *src_string, const UNICHARSET &unicharset)
 
 WERD_CHOICE (const WERD_CHOICE &word)
 
 ~WERD_CHOICE ()
 
const UNICHARSET * unicharset () const
 
int length () const
 
float adjust_factor () const
 
void set_adjust_factor (float factor)
 
const UNICHAR_ID * unichar_ids () const
 
UNICHAR_ID unichar_id (int index) const
 
int state (int index) const
 
tesseract::ScriptPos BlobPosition (int index) const
 
float rating () const
 
float certainty () const
 
float certainty (int index) const
 
float min_x_height () const
 
float max_x_height () const
 
void set_x_heights (float min_height, float max_height)
 
uint8_t permuter () const
 
const char * permuter_name () const
 
BLOB_CHOICE_LIST * blob_choices (int index, MATRIX *ratings) const
 
MATRIX_COORD MatrixCoord (int index) const
 
void set_unichar_id (UNICHAR_ID unichar_id, int index)
 
bool dangerous_ambig_found () const
 
void set_dangerous_ambig_found_ (bool value)
 
void set_rating (float new_val)
 
void set_certainty (float new_val)
 
void set_permuter (uint8_t perm)
 
void set_length (int len)
 
void double_the_size ()
 Make more space in the unichar_ids_, script_pos_, state_ and certainties_ arrays. More...
 
void init (int reserved)
 
void init (const char *src_string, const char *src_lengths, float src_rating, float src_certainty, uint8_t src_permuter)
 
void make_bad ()
 Set the fields in this choice to be default (bad) values. More...
 
void append_unichar_id_space_allocated (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
 
void append_unichar_id (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
 
void set_unichar_id (UNICHAR_ID unichar_id, int blob_count, float rating, float certainty, int index)
 
void set_blob_choice (int index, int blob_count, const BLOB_CHOICE *blob_choice)
 
bool contains_unichar_id (UNICHAR_ID unichar_id) const
 
void remove_unichar_ids (int index, int num)
 
void remove_last_unichar_id ()
 
void remove_unichar_id (int index)
 
bool has_rtl_unichar_id () const
 
void reverse_and_mirror_unichar_ids ()
 
void punct_stripped (int *start_core, int *end_core) const
 
void GetNonSuperscriptSpan (int *start, int *end) const
 
WERD_CHOICE shallow_copy (int start, int end) const
 
void string_and_lengths (STRING *word_str, STRING *word_lengths_str) const
 
const STRING debug_string () const
 
bool ContainsAnyNonSpaceDelimited () const
 
bool IsAllSpaces () const
 
bool set_unichars_in_script_order (bool in_script_order)
 
bool unichars_in_script_order () const
 
const STRING & unichar_string () const
 
const STRING & unichar_lengths () const
 
void SetScriptPositions (bool small_caps, TWERD *word, int debug=0)
 
void SetScriptPositions (const tesseract::ScriptPos *positions, int length)
 
void SetAllScriptPositions (tesseract::ScriptPos position)
 
int GetTopScriptID () const
 
void UpdateStateForSplit (int blob_position)
 
int TotalOfStates () const
 
void print () const
 
void print (const char *msg) const
 
void print_state (const char *msg) const
 
void DisplaySegmentation (TWERD *word)
 
WERD_CHOICE & operator+= (const WERD_CHOICE &second)
 
WERD_CHOICE & operator= (const WERD_CHOICE &source)
 
- Public Member Functions inherited from ELIST_LINK
 ELIST_LINK ()
 
 ELIST_LINK (const ELIST_LINK &)
 
void operator= (const ELIST_LINK &)
 

Static Public Member Functions

static const char * permuter_name (uint8_t permuter)
 
static tesseract::ScriptPos ScriptPositionOf (bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
 

Static Public Attributes

static const float kBadRating = 100000.0
 

Detailed Description

Definition at line 263 of file ratngs.h.

Constructor & Destructor Documentation

◆ WERD_CHOICE() [1/5]

WERD_CHOICE::WERD_CHOICE ( const UNICHARSET * unicharset)
inline

Definition at line 268 of file ratngs.h.

269  : unicharset_(unicharset) { this->init(8); }

◆ WERD_CHOICE() [2/5]

WERD_CHOICE::WERD_CHOICE ( const UNICHARSET *  unicharset,
int  reserved 
)
inline

Definition at line 270 of file ratngs.h.

271  : unicharset_(unicharset) { this->init(reserved); }

◆ WERD_CHOICE() [3/5]

WERD_CHOICE::WERD_CHOICE ( const char *  src_string,
const char *  src_lengths,
float  src_rating,
float  src_certainty,
uint8_t  src_permuter,
const UNICHARSET &  unicharset 
)
inline

Definition at line 272 of file ratngs.h.

278  : unicharset_(&unicharset) {
279  this->init(src_string, src_lengths, src_rating,
280  src_certainty, src_permuter);
281  }

◆ WERD_CHOICE() [4/5]

WERD_CHOICE::WERD_CHOICE ( const char *  src_string,
const UNICHARSET &  unicharset 
)

WERD_CHOICE::WERD_CHOICE

Constructor to build a WERD_CHOICE from the given string. The function assumes that src_string is not nullptr.

Definition at line 222 of file ratngs.cpp.

224  : unicharset_(&unicharset){
225  GenericVector<UNICHAR_ID> encoding;
226  GenericVector<char> lengths;
227  std::string cleaned = unicharset.CleanupString(src_string);
228  if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
229  nullptr)) {
230  lengths.push_back('\0');
231  STRING src_lengths = &lengths[0];
232  this->init(cleaned.c_str(), src_lengths.string(), 0.0, 0.0, NO_PERM);
233  } else { // There must have been an invalid unichar in the string.
234  this->init(8);
235  this->make_bad();
236  }
237 }

◆ WERD_CHOICE() [5/5]

WERD_CHOICE::WERD_CHOICE ( const WERD_CHOICE & word)
inline

Definition at line 283 of file ratngs.h.

284  : ELIST_LINK(word), unicharset_(word.unicharset_) {
285  this->init(word.length());
286  this->operator=(word);
287  }

◆ ~WERD_CHOICE()

WERD_CHOICE::~WERD_CHOICE ( )

WERD_CHOICE::~WERD_CHOICE

Definition at line 280 of file ratngs.cpp.

280  {
281  delete[] unichar_ids_;
282  delete[] script_pos_;
283  delete[] state_;
284  delete[] certainties_;
285 }

Member Function Documentation

◆ adjust_factor()

float WERD_CHOICE::adjust_factor ( ) const
inline

Definition at line 296 of file ratngs.h.

296  {
297  return adjust_factor_;
298  }

◆ append_unichar_id()

void WERD_CHOICE::append_unichar_id ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty 
)

append_unichar_id

Make sure there is enough space in the word for the new unichar id and call append_unichar_id_space_allocated().

Definition at line 472 of file ratngs.cpp.

474  {
475  if (length_ == reserved_) {
476  this->double_the_size();
477  }
478  this->append_unichar_id_space_allocated(unichar_id, blob_count,
479  rating, certainty);
480 }

◆ append_unichar_id_space_allocated()

void WERD_CHOICE::append_unichar_id_space_allocated ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty 
)
inline

This function assumes that there is enough space reserved in the WERD_CHOICE for adding another unichar. This is an efficient alternative to append_unichar_id().

Definition at line 442 of file ratngs.h.

444  {
445  assert(reserved_ > length_);
446  length_++;
447  this->set_unichar_id(unichar_id, blob_count,
448  rating, certainty, length_-1);
449  }

◆ blob_choices()

BLOB_CHOICE_LIST * WERD_CHOICE::blob_choices ( int  index,
MATRIX *  ratings 
) const

Definition at line 294 of file ratngs.cpp.

294  {
295  MATRIX_COORD coord = MatrixCoord(index);
296  BLOB_CHOICE_LIST* result = ratings->get(coord.col, coord.row);
297  if (result == nullptr) {
298  result = new BLOB_CHOICE_LIST;
299  ratings->put(coord.col, coord.row, result);
300  }
301  return result;
302 }

◆ BlobPosition()

tesseract::ScriptPos WERD_CHOICE::BlobPosition ( int  index) const
inline

Definition at line 312 of file ratngs.h.

312  {
313  if (index < 0 || index >= length_)
314  return tesseract::SP_NORMAL;
315  return script_pos_[index];
316  }

◆ certainty() [1/2]

float WERD_CHOICE::certainty ( ) const
inline

Definition at line 320 of file ratngs.h.

320  {
321  return certainty_;
322  }

◆ certainty() [2/2]

float WERD_CHOICE::certainty ( int  index) const
inline

Definition at line 323 of file ratngs.h.

323  {
324  return certainties_[index];
325  }

◆ contains_unichar_id()

bool WERD_CHOICE::contains_unichar_id ( UNICHAR_ID  unichar_id) const

contains_unichar_id

Returns true if unichar_ids_ contain the given unichar_id, false otherwise.

Definition at line 330 of file ratngs.cpp.

330  {
331  for (int i = 0; i < length_; ++i) {
332  if (unichar_ids_[i] == unichar_id) {
333  return true;
334  }
335  }
336  return false;
337 }

◆ ContainsAnyNonSpaceDelimited()

bool WERD_CHOICE::ContainsAnyNonSpaceDelimited ( ) const
inline

Definition at line 504 of file ratngs.h.

504  {
505  for (int i = 0; i < length_; ++i) {
506  if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) return true;
507  }
508  return false;
509  }

◆ dangerous_ambig_found()

bool WERD_CHOICE::dangerous_ambig_found ( ) const
inline

Definition at line 353 of file ratngs.h.

353  {
354  return dangerous_ambig_found_;
355  }

◆ debug_string()

const STRING WERD_CHOICE::debug_string ( ) const
inline

Definition at line 495 of file ratngs.h.

495  {
496  STRING word_str;
497  for (int i = 0; i < length_; ++i) {
498  word_str += unicharset_->debug_str(unichar_ids_[i]);
499  word_str += " ";
500  }
501  return word_str;
502  }

◆ DisplaySegmentation()

void WERD_CHOICE::DisplaySegmentation ( TWERD * word)

Definition at line 765 of file ratngs.cpp.

765  {
766 #ifndef GRAPHICS_DISABLED
767  // Number of different colors to draw with.
768  const int kNumColors = 6;
769  static ScrollView *segm_window = nullptr;
770  // Check the state against the static prev_drawn_state.
771  static GenericVector<int> prev_drawn_state;
772  bool already_done = prev_drawn_state.size() == length_;
773  if (!already_done) prev_drawn_state.init_to_size(length_, 0);
774  for (int i = 0; i < length_; ++i) {
775  if (prev_drawn_state[i] != state_[i]) {
776  already_done = false;
777  }
778  prev_drawn_state[i] = state_[i];
779  }
780  if (already_done || word->blobs.empty()) return;
781 
782  // Create the window if needed.
783  if (segm_window == nullptr) {
784  segm_window = new ScrollView("Segmentation", 5, 10, 500, 256,
785  2000.0, 256.0, true);
786  } else {
787  segm_window->Clear();
788  }
789 
790  TBOX bbox;
791  int blob_index = 0;
792  for (int c = 0; c < length_; ++c) {
793  auto color =
794  static_cast<ScrollView::Color>(c % kNumColors + 3);
795  for (int i = 0; i < state_[c]; ++i, ++blob_index) {
796  TBLOB* blob = word->blobs[blob_index];
797  bbox += blob->bounding_box();
798  blob->plot(segm_window, color, color);
799  }
800  }
801  segm_window->ZoomToRectangle(bbox.left(), bbox.top(),
802  bbox.right(), bbox.bottom());
803  segm_window->Update();
804  window_wait(segm_window);
805 #endif
806 }

◆ double_the_size()

void WERD_CHOICE::double_the_size ( )
inline

Make more space in the unichar_ids_, script_pos_, state_ and certainties_ arrays.

Definition at line 377 of file ratngs.h.

377  {
378  if (reserved_ > 0) {
380  reserved_, unichar_ids_);
382  reserved_, script_pos_);
384  reserved_, state_);
386  reserved_, certainties_);
387  reserved_ *= 2;
388  } else {
389  unichar_ids_ = new UNICHAR_ID[1];
390  script_pos_ = new tesseract::ScriptPos[1];
391  state_ = new int[1];
392  certainties_ = new float[1];
393  reserved_ = 1;
394  }
395  }

◆ GetNonSuperscriptSpan()

void WERD_CHOICE::GetNonSuperscriptSpan ( int *  start,
int *  end 
) const

Definition at line 401 of file ratngs.cpp.

401  {
402  int end = length();
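403  while (end > 0 &&
404  unicharset_->get_isdigit(unichar_ids_[end - 1]) &&
405  BlobPosition(end - 1) == tesseract::SP_SUPERSCRIPT) {
406  end--;
407  }
408  int start = 0;
409  while (start < end &&
410  unicharset_->get_isdigit(unichar_ids_[start]) &&
411  BlobPosition(start) == tesseract::SP_SUPERSCRIPT) {
412  start++;
413  }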
414  *pstart = start;
415  *pend = end;
416 }

◆ GetTopScriptID()

int WERD_CHOICE::GetTopScriptID ( ) const

Definition at line 671 of file ratngs.cpp.

671  {
672  int max_script = unicharset_->get_script_table_size();
673  int *sid = new int[max_script];
674  int x;
675  for (x = 0; x < max_script; x++) sid[x] = 0;
676  for (x = 0; x < length_; ++x) {
677  int script_id = unicharset_->get_script(unichar_id(x));
678  sid[script_id]++;
679  }
680  if (unicharset_->han_sid() != unicharset_->null_sid()) {
681  // Add the Hiragana & Katakana counts to Han and zero them out.
682  if (unicharset_->hiragana_sid() != unicharset_->null_sid()) {
683  sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];
684  sid[unicharset_->hiragana_sid()] = 0;
685  }
686  if (unicharset_->katakana_sid() != unicharset_->null_sid()) {
687  sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];
688  sid[unicharset_->katakana_sid()] = 0;
689  }
690  }
691  // Note that high script ID overrides lower one on a tie, thus biasing
692  // towards non-Common script (if sorted that way in unicharset file).
693  int max_sid = 0;
694  for (x = 1; x < max_script; x++)
695  if (sid[x] >= sid[max_sid]) max_sid = x;
696  if (sid[max_sid] < length_ / 2)
697  max_sid = unicharset_->null_sid();
698  delete[] sid;
699  return max_sid;
700 }

◆ has_rtl_unichar_id()

bool WERD_CHOICE::has_rtl_unichar_id ( ) const

has_rtl_unichar_id

Returns true if unichar_ids contain at least one "strongly" RTL unichar.

Definition at line 435 of file ratngs.cpp.

435  {
436  int i;
437  for (i = 0; i < length_; ++i) {
438  UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
439  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
440  dir == UNICHARSET::U_RIGHT_TO_LEFT_ARABIC) {
441  return true;
442  }
443  }
444  return false;
445 }

◆ init() [1/2]

void WERD_CHOICE::init ( const char *  src_string,
const char *  src_lengths,
float  src_rating,
float  src_certainty,
uint8_t  src_permuter 
)

Helper function to build a WERD_CHOICE from the given string, fragment lengths, rating, certainty and permuter. The function assumes that src_string is not nullptr. src_lengths argument could be nullptr, in which case the unichars in src_string are assumed to all be of length 1.

WERD_CHOICE::init

Helper function to build a WERD_CHOICE from the given string, fragment lengths, rating, certainty and permuter.

The function assumes that src_string is not nullptr. src_lengths argument could be nullptr, in which case the unichars in src_string are assumed to all be of length 1.

Definition at line 249 of file ratngs.cpp.

253  {
254  int src_string_len = strlen(src_string);
255  if (src_string_len == 0) {
256  this->init(8);
257  } else {
258  this->init(src_lengths ? strlen(src_lengths): src_string_len);
259  length_ = reserved_;
260  int offset = 0;
261  for (int i = 0; i < length_; ++i) {
262  int unichar_length = src_lengths ? src_lengths[i] : 1;
263  unichar_ids_[i] =
264  unicharset_->unichar_to_id(src_string+offset, unichar_length);
265  state_[i] = 1;
266  certainties_[i] = src_certainty;
267  offset += unichar_length;
268  }
269  }
270  adjust_factor_ = 1.0f;
271  rating_ = src_rating;
272  certainty_ = src_certainty;
273  permuter_ = src_permuter;
274  dangerous_ambig_found_ = false;
275 }

◆ init() [2/2]

void WERD_CHOICE::init ( int  reserved)
inline

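Initializes WERD_CHOICE - reserves the requested number of slots (reserved) in the unichar_ids_, script_pos_, state_ and certainties_ arrays, or leaves them as nullptr when reserved is not positive. Sets other values to default (blank) values.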

Definition at line 399 of file ratngs.h.

399  {
400  reserved_ = reserved;
401  if (reserved > 0) {
402  unichar_ids_ = new UNICHAR_ID[reserved];
403  script_pos_ = new tesseract::ScriptPos[reserved];
404  state_ = new int[reserved];
405  certainties_ = new float[reserved];
406  } else {
407  unichar_ids_ = nullptr;
408  script_pos_ = nullptr;
409  state_ = nullptr;
410  certainties_ = nullptr;
411  }
412  length_ = 0;
413  adjust_factor_ = 1.0f;
414  rating_ = 0.0;
415  certainty_ = FLT_MAX;
416  min_x_height_ = 0.0f;
417  max_x_height_ = FLT_MAX;
418  permuter_ = NO_PERM;
419  unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
420  dangerous_ambig_found_ = false;
421  }

◆ IsAllSpaces()

bool WERD_CHOICE::IsAllSpaces ( ) const
inline

Definition at line 511 of file ratngs.h.

511  {
512  for (int i = 0; i < length_; ++i) {
513  if (unichar_ids_[i] != UNICHAR_SPACE) return false;
514  }
515  return true;
516  }

◆ length()

int WERD_CHOICE::length ( ) const
inline

Definition at line 293 of file ratngs.h.

293  {
294  return length_;
295  }

◆ make_bad()

void WERD_CHOICE::make_bad ( )
inline

Set the fields in this choice to be default (bad) values.

Definition at line 433 of file ratngs.h.

433  {
434  length_ = 0;
435  rating_ = kBadRating;
436  certainty_ = -FLT_MAX;
437  }

◆ MatrixCoord()

MATRIX_COORD WERD_CHOICE::MatrixCoord ( int  index) const

Definition at line 306 of file ratngs.cpp.

306  {
307  int col = 0;
308  for (int i = 0; i < index; ++i)
309  col += state_[i];
310  int row = col + state_[index] - 1;
311  return MATRIX_COORD(col, row);
312 }

◆ max_x_height()

float WERD_CHOICE::max_x_height ( ) const
inline

Definition at line 329 of file ratngs.h.

329  {
330  return max_x_height_;
331  }

◆ min_x_height()

float WERD_CHOICE::min_x_height ( ) const
inline

Definition at line 326 of file ratngs.h.

326  {
327  return min_x_height_;
328  }

◆ operator+=()

WERD_CHOICE & WERD_CHOICE::operator+= ( const WERD_CHOICE & second)

WERD_CHOICE::operator+=

Cat a second word rating on the end of this current one. The ratings are added and the confidence is the min. If the permuters are NOT the same the permuter is set to COMPOUND_PERM.

Definition at line 489 of file ratngs.cpp.

489  {
490  ASSERT_HOST(unicharset_ == second.unicharset_);
491  while (reserved_ < length_ + second.length()) {
492  this->double_the_size();
493  }
494  const UNICHAR_ID *other_unichar_ids = second.unichar_ids();
495  for (int i = 0; i < second.length(); ++i) {
496  unichar_ids_[length_ + i] = other_unichar_ids[i];
497  state_[length_ + i] = second.state_[i];
498  certainties_[length_ + i] = second.certainties_[i];
499  script_pos_[length_ + i] = second.BlobPosition(i);
500  }
501  length_ += second.length();
502  if (second.adjust_factor_ > adjust_factor_)
503  adjust_factor_ = second.adjust_factor_;
504  rating_ += second.rating(); // add ratings
505  if (second.certainty() < certainty_) // take min
506  certainty_ = second.certainty();
507  if (second.dangerous_ambig_found_)
508  dangerous_ambig_found_ = true;
509  if (permuter_ == NO_PERM) {
510  permuter_ = second.permuter();
511  } else if (second.permuter() != NO_PERM &&
512  second.permuter() != permuter_) {
513  permuter_ = COMPOUND_PERM;
514  }
515  return *this;
516 }

◆ operator=()

WERD_CHOICE & WERD_CHOICE::operator= ( const WERD_CHOICE & source)

WERD_CHOICE::operator=

Allocate enough memory to hold a copy of source and copy over all the information from source to this WERD_CHOICE.

Definition at line 525 of file ratngs.cpp.

525  {
526  while (reserved_ < source.length()) {
527  this->double_the_size();
528  }
529 
530  unicharset_ = source.unicharset_;
531  const UNICHAR_ID *other_unichar_ids = source.unichar_ids();
532  for (int i = 0; i < source.length(); ++i) {
533  unichar_ids_[i] = other_unichar_ids[i];
534  state_[i] = source.state_[i];
535  certainties_[i] = source.certainties_[i];
536  script_pos_[i] = source.BlobPosition(i);
537  }
538  length_ = source.length();
539  adjust_factor_ = source.adjust_factor_;
540  rating_ = source.rating();
541  certainty_ = source.certainty();
542  min_x_height_ = source.min_x_height();
543  max_x_height_ = source.max_x_height();
544  permuter_ = source.permuter();
545  dangerous_ambig_found_ = source.dangerous_ambig_found_;
546  return *this;
547 }

◆ permuter()

uint8_t WERD_CHOICE::permuter ( ) const
inline

Definition at line 336 of file ratngs.h.

336  {
337  return permuter_;
338  }

◆ permuter_name() [1/2]

const char * WERD_CHOICE::permuter_name ( ) const

Definition at line 287 of file ratngs.cpp.

287  {
288  return kPermuterTypeNames[permuter_];
289 }

◆ permuter_name() [2/2]

const char * WERD_CHOICE::permuter_name ( uint8_t  permuter)
static

Definition at line 198 of file ratngs.cpp.

198  {
199  return kPermuterTypeNames[permuter];
200 }

◆ print() [1/2]

void WERD_CHOICE::print ( ) const
inline

Definition at line 570 of file ratngs.h.

570 { this->print(""); }

◆ print() [2/2]

void WERD_CHOICE::print ( const char *  msg) const

WERD_CHOICE::print

Print WERD_CHOICE to stdout.

Definition at line 728 of file ratngs.cpp.

728  {
729  tprintf("%s : ", msg);
730  for (int i = 0; i < length_; ++i) {
731  tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
732  }
733  tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n",
734  rating_, certainty_, adjust_factor_, permuter_,
735  min_x_height_, max_x_height_, dangerous_ambig_found_);
736  tprintf("pos");
737  for (int i = 0; i < length_; ++i) {
738  tprintf("\t%s", ScriptPosToString(script_pos_[i]));
739  }
740  tprintf("\nstr");
741  for (int i = 0; i < length_; ++i) {
742  tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
743  }
744  tprintf("\nstate:");
745  for (int i = 0; i < length_; ++i) {
746  tprintf("\t%d ", state_[i]);
747  }
748  tprintf("\nC");
749  for (int i = 0; i < length_; ++i) {
750  tprintf("\t%.3f", certainties_[i]);
751  }
752  tprintf("\n");
753 }

◆ print_state()

void WERD_CHOICE::print_state ( const char *  msg) const

Definition at line 756 of file ratngs.cpp.

756  {
757  tprintf("%s", msg);
758  for (int i = 0; i < length_; ++i)
759  tprintf(" %d", state_[i]);
760  tprintf("\n");
761 }

◆ punct_stripped()

void WERD_CHOICE::punct_stripped ( int *  start,
int *  end 
) const

punct_stripped

Returns the half-open interval of unichar_id indices [start, end) which enclose the core portion of this word – the part after stripping punctuation from the left and right.

Definition at line 387 of file ratngs.cpp.

387  {
388  *start = 0;
389  *end = length() - 1;
390  while (*start < length() &&
391  unicharset()->get_ispunctuation(unichar_id(*start))) {
392  (*start)++;
393  }
394  while (*end > -1 &&
395  unicharset()->get_ispunctuation(unichar_id(*end))) {
396  (*end)--;
397  }
398  (*end)++;
399 }

◆ rating()

float WERD_CHOICE::rating ( ) const
inline

Definition at line 317 of file ratngs.h.

317  {
318  return rating_;
319  }

◆ remove_last_unichar_id()

void WERD_CHOICE::remove_last_unichar_id ( )
inline

Definition at line 473 of file ratngs.h.

473 { --length_; }

◆ remove_unichar_id()

void WERD_CHOICE::remove_unichar_id ( int  index)
inline

Definition at line 474 of file ratngs.h.

474  {
475  this->remove_unichar_ids(index, 1);
476  }

◆ remove_unichar_ids()

void WERD_CHOICE::remove_unichar_ids ( int  start,
int  num 
)

remove_unichar_ids

Removes num unichar ids starting from index start from unichar_ids_ and updates length_ and fragment_lengths_ to reflect this change. Note: this function does not modify rating_ and certainty_.

Definition at line 346 of file ratngs.cpp.

346  {
347  ASSERT_HOST(start >= 0 && start + num <= length_);
348  // Accumulate the states to account for the merged blobs.
349  for (int i = 0; i < num; ++i) {
350  if (start > 0)
351  state_[start - 1] += state_[start + i];
352  else if (start + num < length_)
353  state_[start + num] += state_[start + i];
354  }
355  for (int i = start; i + num < length_; ++i) {
356  unichar_ids_[i] = unichar_ids_[i + num];
357  script_pos_[i] = script_pos_[i + num];
358  state_[i] = state_[i + num];
359  certainties_[i] = certainties_[i + num];
360  }
361  length_ -= num;
362 }

◆ reverse_and_mirror_unichar_ids()

void WERD_CHOICE::reverse_and_mirror_unichar_ids ( )

reverse_and_mirror_unichar_ids

Reverses and mirrors unichars in unichar_ids.

Definition at line 369 of file ratngs.cpp.

369  {
370  for (int i = 0; i < length_ / 2; ++i) {
371  UNICHAR_ID tmp_id = unichar_ids_[i];
372  unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]);
373  unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id);
374  }
375  if (length_ % 2 != 0) {
376  unichar_ids_[length_/2] = unicharset_->get_mirror(unichar_ids_[length_/2]);
377  }
378 }

◆ ScriptPositionOf()

ScriptPos WERD_CHOICE::ScriptPositionOf ( bool  print_debug,
const UNICHARSET &  unicharset,
const TBOX &  blob_box,
UNICHAR_ID  unichar_id 
)
static

Definition at line 633 of file ratngs.cpp.

636  {
638  int top = blob_box.top();
639  int bottom = blob_box.bottom();
640  int min_bottom, max_bottom, min_top, max_top;
642  &min_bottom, &max_bottom,
643  &min_top, &max_top);
644 
645  int sub_thresh_top = min_top - kMinSubscriptOffset;
646  int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
647  int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;
648  if (bottom <= kMaxDropCapBottom) {
649  retval = tesseract::SP_DROPCAP;
650  } else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
651  retval = tesseract::SP_SUBSCRIPT;
652  } else if (bottom > sup_thresh_bot) {
653  retval = tesseract::SP_SUPERSCRIPT;
654  }
655 
656  if (print_debug) {
657  const char *pos = ScriptPosToString(retval);
658  tprintf("%s Character %s[bot:%d top: %d] "
659  "bot_range[%d,%d] top_range[%d, %d] "
660  "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
662  bottom, top,
663  min_bottom, max_bottom, min_top, max_top,
664  sub_thresh_bot, sub_thresh_top,
665  sup_thresh_bot);
666  }
667  return retval;
668 }

◆ set_adjust_factor()

void WERD_CHOICE::set_adjust_factor ( float  factor)
inline

Definition at line 299 of file ratngs.h.

299  {
300  adjust_factor_ = factor;
301  }

◆ set_blob_choice()

void WERD_CHOICE::set_blob_choice ( int  index,
int  blob_count,
const BLOB_CHOICE *  blob_choice 
)

Definition at line 316 of file ratngs.cpp.

317  {
318  unichar_ids_[index] = blob_choice->unichar_id();
319  script_pos_[index] = tesseract::SP_NORMAL;
320  state_[index] = blob_count;
321  certainties_[index] = blob_choice->certainty();
322 }

◆ set_certainty()

void WERD_CHOICE::set_certainty ( float  new_val)
inline

Definition at line 362 of file ratngs.h.

362  {
363  certainty_ = new_val;
364  }

◆ set_dangerous_ambig_found_()

void WERD_CHOICE::set_dangerous_ambig_found_ ( bool  value)
inline

Definition at line 356 of file ratngs.h.

356  {
357  dangerous_ambig_found_ = value;
358  }

◆ set_length()

void WERD_CHOICE::set_length ( int  len)
inline

Definition at line 371 of file ratngs.h.

371  {
372  ASSERT_HOST(reserved_ >= len);
373  length_ = len;
374  }

◆ set_permuter()

void WERD_CHOICE::set_permuter ( uint8_t  perm)
inline

Definition at line 365 of file ratngs.h.

365  {
366  permuter_ = perm;
367  }

◆ set_rating()

void WERD_CHOICE::set_rating ( float  new_val)
inline

Definition at line 359 of file ratngs.h.

359  {
360  rating_ = new_val;
361  }

◆ set_unichar_id() [1/2]

void WERD_CHOICE::set_unichar_id ( UNICHAR_ID  unichar_id,
int  blob_count,
float  rating,
float  certainty,
int  index 
)
inline

Definition at line 454 of file ratngs.h.

455  {
456  assert(index < length_);
457  unichar_ids_[index] = unichar_id;
458  state_[index] = blob_count;
459  certainties_[index] = certainty;
460  script_pos_[index] = tesseract::SP_NORMAL;
461  rating_ += rating;
462  if (certainty < certainty_) {
463  certainty_ = certainty;
464  }
465  }

◆ set_unichar_id() [2/2]

void WERD_CHOICE::set_unichar_id ( UNICHAR_ID  unichar_id,
int  index 
)
inline

Definition at line 349 of file ratngs.h.

349  {
350  assert(index < length_);
351  unichar_ids_[index] = unichar_id;
352  }

◆ set_unichars_in_script_order()

bool WERD_CHOICE::set_unichars_in_script_order ( bool  in_script_order)
inline

Definition at line 521 of file ratngs.h.

521  {
522  return unichars_in_script_order_ = in_script_order;
523  }

◆ set_x_heights()

void WERD_CHOICE::set_x_heights ( float  min_height,
float  max_height 
)
inline

Definition at line 332 of file ratngs.h.

332  {
333  min_x_height_ = min_height;
334  max_x_height_ = max_height;
335  }

◆ SetAllScriptPositions()

void WERD_CHOICE::SetAllScriptPositions ( tesseract::ScriptPos  position)

Definition at line 627 of file ratngs.cpp.

627  {
628  for (int i = 0; i < length_; ++i)
629  script_pos_[i] = position;
630 }

◆ SetScriptPositions() [1/2]

void WERD_CHOICE::SetScriptPositions ( bool  small_caps,
TWERD *  word,
int  debug = 0 
)

Definition at line 554 of file ratngs.cpp.

554  {
555  // Initialize to normal.
556  for (int i = 0; i < length_; ++i)
557  script_pos_[i] = tesseract::SP_NORMAL;
558  if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
559  return;
560  }
561 
562  int position_counts[4] = { 0, 0, 0, 0 };
563 
564  int chunk_index = 0;
565  for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
566  TBLOB* tblob = word->blobs[chunk_index];
567  int uni_id = unichar_id(blob_index);
568  TBOX blob_box = tblob->bounding_box();
569  if (state_ != nullptr) {
570  for (int i = 1; i < state_[blob_index]; ++i) {
571  ++chunk_index;
572  tblob = word->blobs[chunk_index];
573  blob_box += tblob->bounding_box();
574  }
575  }
576  script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box,
577  uni_id);
578  if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
579  script_pos_[blob_index] = tesseract::SP_NORMAL;
580  }
581  position_counts[script_pos_[blob_index]]++;
582  }
583  // If almost everything looks like a superscript or subscript,
584  // we most likely just got the baseline wrong.
585  if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ ||
586  position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) {
587  if (debug >= 2) {
588  tprintf("Most characters of %s are subscript or superscript.\n"
589  "That seems wrong, so I'll assume we got the baseline wrong\n",
590  unichar_string().string());
591  }
592  for (int i = 0; i < length_; i++) {
593  ScriptPos sp = script_pos_[i];
595  position_counts[sp]--;
596  position_counts[tesseract::SP_NORMAL]++;
597  script_pos_[i] = tesseract::SP_NORMAL;
598  }
599  }
600  }
601 
602  if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) ||
603  debug >= 2) {
604  tprintf("SetScriptPosition on %s\n", unichar_string().string());
605  int chunk_index = 0;
606  for (int blob_index = 0; blob_index < length_; ++blob_index) {
607  if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
608  TBLOB* tblob = word->blobs[chunk_index];
609  ScriptPositionOf(true, *unicharset_, tblob->bounding_box(),
610  unichar_id(blob_index));
611  }
612  chunk_index += state_ != nullptr ? state_[blob_index] : 1;
613  }
614  }
615 }

◆ SetScriptPositions() [2/2]

void WERD_CHOICE::SetScriptPositions ( const tesseract::ScriptPos *  positions,
int  length 
)

Definition at line 617 of file ratngs.cpp.

618  {
619  ASSERT_HOST(length == length_);
620  if (positions != script_pos_) {
621  delete [] script_pos_;
622  script_pos_ = new ScriptPos[length];
623  memcpy(script_pos_, positions, sizeof(positions[0]) * length);
624  }
625 }

◆ shallow_copy()

WERD_CHOICE WERD_CHOICE::shallow_copy ( int  start,
int  end 
) const

Definition at line 418 of file ratngs.cpp.

418  {
419  ASSERT_HOST(start >= 0 && start <= length_);
420  ASSERT_HOST(end >= 0 && end <= length_);
421  if (end < start) { end = start; }
422  WERD_CHOICE retval(unicharset_, end - start);
423  for (int i = start; i < end; i++) {
424  retval.append_unichar_id_space_allocated(
425  unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
426  }
427  return retval;
428 }

◆ state()

int WERD_CHOICE::state ( int  index) const
inline

Definition at line 309 of file ratngs.h.

309  {
310  return state_[index];
311  }

◆ string_and_lengths()

void WERD_CHOICE::string_and_lengths ( STRING word_str,
STRING word_lengths_str 
) const

string_and_lengths

Populates the given word_str with the unichars corresponding to unichar_ids and, if word_lengths_str is not nullptr, populates word_lengths_str with the corresponding unichar lengths.

Definition at line 453 of file ratngs.cpp.

454  {
455  *word_str = "";
456  if (word_lengths_str != nullptr) *word_lengths_str = "";
457  for (int i = 0; i < length_; ++i) {
458  const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
459  *word_str += ch;
460  if (word_lengths_str != nullptr) {
461  *word_lengths_str += strlen(ch);
462  }
463  }
464 }

◆ TotalOfStates()

int WERD_CHOICE::TotalOfStates ( ) const

Definition at line 715 of file ratngs.cpp.

715  {
716  int total_chunks = 0;
717  for (int i = 0; i < length_; ++i) {
718  total_chunks += state_[i];
719  }
720  return total_chunks;
721 }

◆ unichar_id()

UNICHAR_ID WERD_CHOICE::unichar_id ( int  index) const
inline

Definition at line 305 of file ratngs.h.

305  {
306  assert(index < length_);
307  return unichar_ids_[index];
308  }

◆ unichar_ids()

const UNICHAR_ID* WERD_CHOICE::unichar_ids ( ) const
inline

Definition at line 302 of file ratngs.h.

302  {
303  return unichar_ids_;
304  }

◆ unichar_lengths()

const STRING& WERD_CHOICE::unichar_lengths ( ) const
inline

Definition at line 538 of file ratngs.h.

538  {
539  this->string_and_lengths(&unichar_string_, &unichar_lengths_);
540  return unichar_lengths_;
541  }

◆ unichar_string()

const STRING& WERD_CHOICE::unichar_string ( ) const
inline

Definition at line 531 of file ratngs.h.

531  {
532  this->string_and_lengths(&unichar_string_, &unichar_lengths_);
533  return unichar_string_;
534  }

◆ unichars_in_script_order()

bool WERD_CHOICE::unichars_in_script_order ( ) const
inline

Definition at line 525 of file ratngs.h.

525  {
526  return unichars_in_script_order_;
527  }

◆ unicharset()

const UNICHARSET* WERD_CHOICE::unicharset ( ) const
inline

Definition at line 290 of file ratngs.h.

290  {
291  return unicharset_;
292  }

◆ UpdateStateForSplit()

void WERD_CHOICE::UpdateStateForSplit ( int  blob_position)

Definition at line 703 of file ratngs.cpp.

703  {
704  int total_chunks = 0;
705  for (int i = 0; i < length_; ++i) {
706  total_chunks += state_[i];
707  if (total_chunks > blob_position) {
708  ++state_[i];
709  return;
710  }
711  }
712 }

Member Data Documentation

◆ kBadRating

const float WERD_CHOICE::kBadRating = 100000.0
static

Definition at line 265 of file ratngs.h.


The documentation for this class was generated from the following files: ratngs.h and ratngs.cpp.
GenericVector::init_to_size
void init_to_size(int size, const T &t)
Definition: genericvector.h:744
TBOX
Definition: rect.h:34
TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:448
UNICHARSET::get_script_table_size
int get_script_table_size() const
Definition: unicharset.h:849
WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:349
UNICHARSET::CleanupString
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:246
STRING::string
const char * string() const
Definition: strngs.cpp:194
MATRIX_COORD::col
int col
Definition: matrix.h:636
WERD_CHOICE::BlobPosition
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:312
BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
MATRIX_COORD
Definition: matrix.h:608
kMaxDropCapBottom
const int kMaxDropCapBottom
Definition: ratngs.cpp:47
UNICHARSET::Direction
Direction
Definition: unicharset.h:156
WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:531
WERD_CHOICE::MatrixCoord
MATRIX_COORD MatrixCoord(int index) const
Definition: ratngs.cpp:306
UNICHARSET::encode_string
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:259
NO_PERM
@ NO_PERM
Definition: ratngs.h:233
TBOX::right
int16_t right() const
Definition: rect.h:79
UNICHARSET::IsSpaceDelimited
bool IsSpaceDelimited(UNICHAR_ID unichar_id) const
Definition: unicharset.h:652
UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
ELIST_LINK::ELIST_LINK
ELIST_LINK()
Definition: elst.h:130
MATRIX_COORD::row
int row
Definition: matrix.h:637
WERD_CHOICE::remove_unichar_ids
void remove_unichar_ids(int index, int num)
Definition: ratngs.cpp:346
WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:317
UNICHARSET::han_sid
int han_sid() const
Definition: unicharset.h:889
TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
UNICHARSET::U_RIGHT_TO_LEFT_ARABIC
@ U_RIGHT_TO_LEFT_ARABIC
Definition: unicharset.h:170
kMinSuperscriptOffset
const int kMinSuperscriptOffset
Definition: ratngs.cpp:45
window_wait
char window_wait(ScrollView *win)
Definition: callcpp.cpp:103
UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
GenericVector< UNICHAR_ID >
UNICHARSET::debug_str
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:343
UNICHARSET::get_top_bottom
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:568
tesseract::SP_SUBSCRIPT
@ SP_SUBSCRIPT
Definition: ratngs.h:254
UNICHARSET::katakana_sid
int katakana_sid() const
Definition: unicharset.h:891
TBOX::left
int16_t left() const
Definition: rect.h:72
UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:34
WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
tprintf
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
COMPOUND_PERM
@ COMPOUND_PERM
Definition: ratngs.h:245
WERD_CHOICE::operator=
WERD_CHOICE & operator=(const WERD_CHOICE &source)
Definition: ratngs.cpp:525
ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:88
ScrollView
Definition: scrollview.h:98
ScrollView::ZoomToRectangle
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:757
ScrollView::Update
static void Update()
Definition: scrollview.cpp:709
tesseract::ScriptPos
ScriptPos
Definition: ratngs.h:252
WERD_CHOICE::ScriptPositionOf
static tesseract::ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
Definition: ratngs.cpp:633
UNICHARSET::get_script
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:663
TBOX::top
int16_t top() const
Definition: rect.h:58
ScrollView::Color
Color
Definition: scrollview.h:101
tesseract::SP_NORMAL
@ SP_NORMAL
Definition: ratngs.h:253
GENERIC_2D_ARRAY::put
void put(ICOORD pos, const T &thing)
Definition: matrix.h:223
WERD_CHOICE::unichar_ids
const UNICHAR_ID * unichar_ids() const
Definition: ratngs.h:302
UNICHARSET::hiragana_sid
int hiragana_sid() const
Definition: unicharset.h:890
WERD_CHOICE::max_x_height
float max_x_height() const
Definition: ratngs.h:329
WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:320
kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:25
WERD_CHOICE::min_x_height
float min_x_height() const
Definition: ratngs.h:326
WERD_CHOICE::unicharset
const UNICHARSET * unicharset() const
Definition: ratngs.h:290
GenericVector::size
int size() const
Definition: genericvector.h:72
TBLOB
Definition: blobs.h:284
tesseract::SP_DROPCAP
@ SP_DROPCAP
Definition: ratngs.h:256
WERD_CHOICE::make_bad
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:433
WERD_CHOICE::double_the_size
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:377
UNICHARSET::null_sid
int null_sid() const
Definition: unicharset.h:884
WERD_CHOICE::append_unichar_id_space_allocated
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:442
TBLOB::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:468
WERD_CHOICE::kBadRating
static const float kBadRating
Definition: ratngs.h:265
GenericVector::double_the_size_memcpy
static T * double_the_size_memcpy(int current_size, T *data)
Definition: genericvector.h:213
kMinSubscriptOffset
const int kMinSubscriptOffset
Definition: ratngs.cpp:43
tesseract::SP_SUPERSCRIPT
@ SP_SUPERSCRIPT
Definition: ratngs.h:255
UNICHARSET::get_direction
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:690
WERD_CHOICE::permuter
uint8_t permuter() const
Definition: ratngs.h:336
UNICHARSET::id_to_unichar_ext
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:299
WERD_CHOICE::length
int length() const
Definition: ratngs.h:293
BLOB_CHOICE::certainty
float certainty() const
Definition: ratngs.h:83
WERD_CHOICE::TotalOfStates
int TotalOfStates() const
Definition: ratngs.cpp:715
GENERIC_2D_ARRAY::get
T get(ICOORD pos) const
Definition: matrix.h:231
UNICHARSET::U_RIGHT_TO_LEFT
@ U_RIGHT_TO_LEFT
Definition: unicharset.h:158
WERD_CHOICE::string_and_lengths
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:453
STRING
Definition: strngs.h:45
ScrollView::Clear
void Clear()
Definition: scrollview.cpp:589
UNICHARSET::get_mirror
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:697
WERD_CHOICE::print
void print() const
Definition: ratngs.h:570
GenericVector::empty
bool empty() const
Definition: genericvector.h:91
TBLOB::plot
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
Definition: blobs.cpp:510
WERD_CHOICE
Definition: ratngs.h:263
UNICHAR_SPACE
@ UNICHAR_SPACE
Definition: unicharset.h:34
TBOX::bottom
int16_t bottom() const
Definition: rect.h:65
tesseract::ScriptPosToString
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:204
WERD_CHOICE::init
void init(int reserved)
Definition: ratngs.h:399
GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:837