20 #include "config_auto.h"
27 #include "allheaders.h"
142 static BOOL_VAR(textord_show_tables,
false,
"Show table regions");
143 static BOOL_VAR(textord_tablefind_show_mark,
false,
144 "Debug table marking steps in detail");
145 static BOOL_VAR(textord_tablefind_show_stats,
false,
146 "Show page stats used in table finding");
147 static BOOL_VAR(textord_tablefind_recognize_tables,
false,
148 "Enables the table recognizer for table layout and filtering.");
161 global_median_xheight_(0),
162 global_median_blob_width_(0),
163 global_median_ledding_(0),
164 left_to_right_language_(true) {
182 const ICOORD& top_right) {
221 BLOBNBOX_CLIST* part_boxes = part->
boxes();
222 BLOBNBOX_C_IT pit(part_boxes);
223 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
230 if (leader_part ==
nullptr) {
234 leader_part->
AddBox(pblob);
236 clean_part->
AddBox(pblob);
243 if (leader_part !=
nullptr) {
266 #ifndef GRAPHICS_DISABLED
267 if (textord_show_tables) {
275 table_win =
MakeWindow(100, 300,
"Fragmented Text");
285 ColSegment_LIST column_blocks;
299 ColSegment_LIST table_columns;
305 ColSegment_LIST table_regions;
308 #ifndef GRAPHICS_DISABLED
309 if (textord_tablefind_show_mark) {
326 if (textord_tablefind_recognize_tables) {
330 #ifndef GRAPHICS_DISABLED
331 if (textord_show_tables) {
344 #ifndef GRAPHICS_DISABLED
345 if (textord_show_tables) {
359 #ifndef GRAPHICS_DISABLED
360 if (textord_show_tables) {
439 if (part->
boxes()->empty()) {
449 bool found_split =
true;
450 while (found_split) {
452 BLOBNBOX_C_IT box_it(right_part->
boxes());
457 int previous_right = INT32_MIN;
460 for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
461 const TBOX& box = box_it.data()->bounding_box();
462 if (previous_right != INT32_MIN &&
463 box.
left() - previous_right > kThreshold) {
466 int mid_x = (box.
left() + previous_right) / 2;
468 right_part = left_part->
SplitAt(mid_x);
476 previous_right = std::max(previous_right,
static_cast<int>(box.
right()));
509 return box.
height() > kHeightRequired &&
510 box.
width() > kWidthRequired &&
511 box.
area() > kAreaRequired;
524 ColSegment_LIST* column_blocks) {
527 if (columns !=
nullptr) {
528 ColSegment_LIST new_blocks;
539 ColSegment_LIST* column_blocks) {
540 ColSegment_IT src_it(new_blocks);
541 ColSegment_IT dest_it(column_blocks);
543 for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) {
546 bool match_found =
false;
548 for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) {
556 delete src_it.extract();
562 dest_it.add_after_then_move(src_it.extract());
571 return (abs(b1.
left() - b2.
left()) < x_margin) &&
595 int y = part->
MidY();
600 int left_space = std::max(0, box.
left() - left_column->
LeftAtY(y));
605 int right_space = std::max(0, right_column->
RightAtY(y) - box.
right());
619 if (right < box.
left()) {
632 if (left > box.
right()) {
684 if (neighbor == part)
690 if (neighbor_box.
top() < part_box.
bottom() &&
691 gap < min_space_below) {
692 min_space_below = gap;
693 below_neighbor = neighbor;
695 else if (part_box.
top() < neighbor_box.
bottom() &&
696 gap < min_space_above) {
697 min_space_above = gap;
698 above_neighbor = neighbor;
731 BLOBNBOX_C_IT it(part->
boxes());
732 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
733 xheight_stats.
add(it.data()->bounding_box().height(), 1);
734 width_stats.
add(it.data()->bounding_box().width(), 1);
745 #ifndef GRAPHICS_DISABLED
746 if (textord_tablefind_show_stats) {
747 const char* kWindowName =
"X-height (R), X-width (G), and ledding (B)";
791 if (textord_tablefind_show_mark) {
798 if (textord_tablefind_show_mark) {
805 if (textord_tablefind_show_mark) {
812 if (textord_tablefind_show_mark || textord_show_tables) {
861 BLOBNBOX_CLIST* part_boxes = part->
boxes();
862 BLOBNBOX_C_IT it(part_boxes);
872 int previous_x1 = -1;
874 int largest_partition_gap_found = -1;
881 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
885 if (previous_x1 != -1) {
886 int gap = current_x0 - previous_x1;
898 previous_x1 = std::max(previous_x1, current_x1);
912 if (gap > largest_partition_gap_found)
913 largest_partition_gap_found = gap;
915 previous_x1 = current_x1;
929 if (largest_partition_gap_found == -1)
935 return largest_partition_gap_found < min_gap;
955 const int top = box.
top() + search_size;
956 const int bottom = box.
bottom() - search_size;
958 for (
int direction = 0; direction < 2; ++direction) {
959 bool right_to_left = (direction == 0);
960 int x = right_to_left ? box.
right() : box.
left();
963 while ((leader = hsearch.
NextSideSearch(right_to_left)) !=
nullptr) {
1022 int current_spacing = 0;
1023 int upper_spacing = 0;
1029 current_spacing = mid - left;
1030 upper_spacing = upper_mid - left;
1036 current_spacing = right - mid;
1037 upper_spacing = right - upper_mid;
1078 int max_top = INT32_MIN;
1079 int min_bottom = INT32_MAX;
1088 if (top > max_top) {
1092 if (bottom < min_bottom) {
1093 min_bottom = bottom;
1118 if (!upper_part || !lower_part)
1144 ColSegment_IT it(column_blocks);
1145 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1148 int num_table_cells = 0;
1149 int num_text_cells = 0;
1164 if (!num_table_cells && !num_text_cells) {
1165 delete it.extract();
1178 ColSegment_IT it(segments);
1179 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1206 bool neighbor_found =
false;
1207 bool modified =
false;
1213 int top_range = std::min(box.
top() + margin,
static_cast<int>(
tright().y()));
1214 int bottom_range = std::max(box.
bottom() - margin,
static_cast<int>(
bleft().y()));
1217 neighbor_found =
false;
1223 if (neighbor == seg)
1245 neighbor_found =
true;
1252 }
while (neighbor_found);
1274 ColSegment_IT it(table_columns);
1285 col->InsertBox(box);
1294 bool found_neighbours =
false;
1308 col->InsertBox(neighbor_box);
1310 found_neighbours =
true;
1312 if (found_neighbours) {
1313 it.add_after_then_move(col);
1324 ColSegment_LIST* table_regions) {
1325 ColSegment_IT cit(table_columns);
1326 ColSegment_IT rit(table_regions);
1335 bool* table_region =
new bool[page_height];
1339 for (
int i = 0; i < page_height; i++) {
1340 table_region[i] =
false;
1344 cit.move_to_first();
1345 for (cit.mark_cycle_pt(); !cit.cycled_list(); cit.forward()) {
1346 TBOX col_box = cit.data()->bounding_box();
1350 for (
int i = intersection_box.
bottom(); i < intersection_box.
top(); i++) {
1351 table_region[i -
bleft().
y()] =
true;
1355 TBOX current_table_box;
1360 for (
int i = 1; i < page_height; i++) {
1362 if (!table_region[i - 1] && table_region[i]) {
1367 if (table_region[i - 1] && !table_region[i]) {
1369 if (!current_table_box.
null_box()) {
1371 seg->InsertBox(current_table_box);
1372 rit.add_after_then_move(seg);
1377 delete[] table_region;
1393 bool neighbor_found =
false;
1394 bool modified =
false;
1398 TBOX search_region(box);
1401 neighbor_found =
false;
1407 if (neighbor == seg)
1425 neighbor_found =
true;
1432 }
while (neighbor_found);
1488 ColSegment_CLIST adjusted_tables;
1489 ColSegment_C_IT it(&adjusted_tables);
1495 TBOX grown_box = table_box;
1502 col->InsertBox(grown_box);
1503 it.add_after_then_move(col);
1514 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1529 TBOX search_box = table_box;
1543 const TBOX& search_range,
1547 for (
int i = 0; i < 2; ++i) {
1571 const TBOX& search_range,
1586 if (result_box->
contains(part_box))
1601 const TBOX& table_box) {
1619 int num_extra_partitions = 0;
1620 int extra_space_to_right = 0;
1621 int extra_space_to_left = 0;
1624 for (
int i = 0; i < 2; ++i) {
1641 num_extra_partitions++;
1645 extra_space_to_right++;
1646 extra_space_to_left++;
1651 extra_space_to_right++;
1653 extra_space_to_left++;
1658 return (extra_space_to_right > num_extra_partitions / 2) ||
1659 (extra_space_to_left > num_extra_partitions / 2);
1675 int table_top = table_box->
top();
1678 if (box.
bottom() - table_top > max_distance)
1684 previous_neighbor =
nullptr;
1689 if (previous_neighbor ==
nullptr) {
1690 previous_neighbor = neighbor;
1707 int* table_xprojection =
new int[page_width];
1716 for (
int i = 0; i < page_width; i++) {
1717 table_xprojection[i] = 0;
1734 BLOBNBOX_CLIST* part_boxes = part->
boxes();
1735 BLOBNBOX_C_IT pit(part_boxes);
1742 int next_position_to_write = 0;
1744 for (pit.mark_cycle_pt(); !pit.cycled_list(); pit.forward()) {
1751 xstart = std::max(xstart, next_position_to_write);
1752 for (
int i = xstart; i < xend; i++)
1753 table_xprojection[i -
bleft().
x()]++;
1754 next_position_to_write = xend;
1763 delete[] table_xprojection;
1771 for (
int i = 0; i < length; i++) {
1772 if (xprojection[i] > peak_value) {
1773 peak_value = xprojection[i];
1785 for (
int i = 0; i < length; i++) {
1786 xprojection[i] = (xprojection[i] >= projection_threshold) ? 1 : 0;
1789 int largest_gap = 0;
1791 for (
int i = 1; i < length; i++) {
1793 if (xprojection[i - 1] && !xprojection[i]) {
1797 if (run_start != -1 && !xprojection[i - 1] && xprojection[i]) {
1798 int gap = i - run_start;
1799 if (gap > largest_gap)
1819 if (textord_show_tables) {
1820 table_win =
MakeWindow(0, 0,
"Table Structure");
1835 ColSegment_CLIST good_tables;
1836 ColSegment_C_IT good_it(&good_tables);
1851 if (table_structure !=
nullptr) {
1852 if (textord_show_tables) {
1856 delete table_structure;
1857 good_it.add_after_then_move(found_table);
1866 for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward())
1872 ColSegment_LIST *segments,
1874 #ifndef GRAPHICS_DISABLED
1877 ColSegment_IT it(segments);
1878 for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
1881 int left_x = box.
left();
1882 int right_x = box.
right();
1883 int top_y = box.
top();
1884 int bottom_y = box.
bottom();
1885 win->
Rectangle(left_x, bottom_y, right_x, top_y);
1893 #ifndef GRAPHICS_DISABLED
1901 int left_x = box.
left();
1902 int right_x = box.
right();
1903 int top_y = box.
top();
1904 int bottom_y = box.
bottom();
1907 win->
Rectangle(left_x, bottom_y, right_x, top_y);
1920 #ifndef GRAPHICS_DISABLED
1928 color = default_color;
1930 color = table_color;
1933 int left_x = box.
left();
1934 int right_x = box.
right();
1935 int top_y = box.
top();
1936 int bottom_y = box.
bottom();
1939 win->
Rectangle(left_x, bottom_y, right_x, top_y);
1954 #ifndef GRAPHICS_DISABLED
1962 int left_x = box.
left();
1963 int right_x = box.
right();
1964 int top_y = box.
top();
1965 int bottom_y = box.
bottom();
1970 int mid_x = (left_x + right_x) / 2;
1971 int mid_y = (top_y + bottom_y) / 2;
1972 int other_x = (upper_box.
left() + upper_box.
right()) / 2;
1973 int other_y = (upper_box.
top() + upper_box.
bottom()) / 2;
1976 win->
Line(mid_x, mid_y, other_x, other_y);
1981 int mid_x = (left_x + right_x) / 2;
1982 int mid_y = (top_y + bottom_y) / 2;
1983 int other_x = (lower_box.
left() + lower_box.
right()) / 2;
1984 int other_y = (lower_box.
top() + lower_box.
bottom()) / 2;
1987 win->
Line(mid_x, mid_y, other_x, other_y);
2035 if (table_partition) {
2036 table_partition->
Absorb(part, width_cb);
2038 table_partition = part;
2043 if (table_partition) {
2055 grid->
InsertBBox(
true,
true, table_partition);
2064 num_table_cells_(0),
2077 return kBoxColors[type_];
2090 else if (num_text_cells_ > num_table_cells_)
CLISTIZE(BLOCK_RES) ELISTIZE(ROW_RES) ELISTIZE(WERD_RES) static const double kStopperAmbiguityThresholdGain
#define ELISTIZE(CLASSNAME)
#define BOOL_VAR(name, val, comment)
const int kMaxVerticalSpacing
const double kAllowBlobHeight
const double kMinOverlapWithTable
const double kMaxTableCellXheight
const double kMaxParagraphEndingLeftSpaceMultiple
const double kMinMaxGapInTextPartition
const double kLargeTableProjectionThreshold
const int kMinBoxesInTextPartition
const double kStrokeWidthFractionalTolerance
const int kMinRowsInTable
const double kMinParagraphEndingTextToWhitespaceRatio
const double kMaxGapInTextPartition
void DeleteObject(T *object)
const double kTableColumnThreshold
const double kAllowBlobArea
const double kAllowTextArea
const int kAdjacentLeaderSearchPadding
const double kStrokeWidthConstantTolerance
const double kAllowTextWidth
const double kAllowTextHeight
const double kAllowBlobWidth
const int kMaxColumnHeaderDistance
const int kMaxBoxesInDataPartition
const double kParagraphEndingPreviousLineRatio
const int kLargeTableRowCount
const double kSmallTableProjectionThreshold
const int kSideSpaceMargin
const double kSplitPartitionSize
const double kMaxBlobOverlapFactor
const double kMaxXProjectionGapFactor
BlobRegionType region_type() const
BlobTextFlowType flow() const
const TBOX & bounding_box() const
int16_t y() const
access_function
int16_t x() const
access function
double overlap_fraction(const TBOX &box) const
bool overlap(const TBOX &box) const
TBOX bounding_union(const TBOX &box) const
TBOX intersection(const TBOX &box) const
bool contains(const FCOORD pt) const
bool major_x_overlap(const TBOX &box) const
bool major_y_overlap(const TBOX &box) const
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
void add(int32_t value, int32_t count)
void StartVerticalSearch(int xmin, int xmax, int y)
void SetUniqueMode(bool mode)
BBC * NextSideSearch(bool right_to_left)
void StartSideSearch(int x, int ymin, int ymax)
BBC * NextVerticalSearch(bool top_to_bottom)
void RepositionIterator()
void StartRectSearch(const TBOX &rect)
const ICOORD & bleft() const
void GridCoords(int x, int y, int *grid_x, int *grid_y) const
const ICOORD & tright() const
void DisplayBoxes(ScrollView *window)
void Init(int gridsize, const ICOORD &bleft, const ICOORD &tright)
void InsertBBox(bool h_spread, bool v_spread, BBC *bbox)
void ClearGridData(void(*free_method)(BBC *))
ScrollView * MakeWindow(int x, int y, const char *window_name)
void set_space_to_left(int space)
BlobTextFlowType flow() const
bool MatchingStrokeWidth(const ColPartition &other, double fractional_tolerance, double constant_tolerance) const
bool IsHorizontalLine() const
PolyBlockType type() const
ColPartition * CopyButDontOwnBlobs()
ColPartition * SplitAt(int split_x)
int median_bottom() const
bool inside_table_column()
ColPartition * nearest_neighbor_below() const
bool VSignificantCoreOverlap(const ColPartition &other) const
void set_nearest_neighbor_above(ColPartition *part)
BlobRegionType blob_type() const
void AddBox(BLOBNBOX *box)
void set_blob_type(BlobRegionType t)
void set_nearest_neighbor_below(ColPartition *part)
int space_to_left() const
void set_space_above(int space)
void set_inside_table_column(bool val)
bool MatchingSizes(const ColPartition &other) const
void set_space_to_right(int space)
ColPartition * ShallowCopy() const
bool IsInSameColumnAs(const ColPartition &part) const
int space_to_right() const
ColPartition * nearest_neighbor_above() const
ColPartition * SingletonPartner(bool upper)
const TBOX & bounding_box() const
void SetPartitionType(int resolution, ColPartitionSet *columns)
void set_space_below(int space)
int RightAtY(int y) const
void Absorb(ColPartition *other, WidthCallback *cb)
int median_height() const
void set_flow(BlobTextFlowType f)
void RefinePartitionPartners(bool get_desperate)
void FindPartitionPartners()
ColPartition * ColumnContaining(int x, int y)
void GetColumnBoxes(int y_bottom, int y_top, ColSegment_LIST *segments)
void InsertBox(const TBOX &other)
void set_bounding_box(const TBOX &other)
void set_num_table_cells(int n)
void set_num_text_cells(int n)
ScrollView::Color BoxColor() const
const TBOX & bounding_box() const
void DisplayColSegments(ScrollView *win, ColSegment_LIST *cols, ScrollView::Color color)
bool BelongToOneTable(const TBOX &box1, const TBOX &box2)
void GrowTableBox(const TBOX &table_box, TBOX *result_box)
void MakeTableBlocks(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb)
ScrollView * MakeWindow(int x, int y, const char *window_name)
int global_median_ledding_
void InsertFragmentedTextPartition(ColPartition *part)
void IncludeLeftOutColumnHeaders(TBOX *table_box)
void FilterHeaderAndFooter()
void Init(int grid_size, const ICOORD &bottom_left, const ICOORD &top_right)
void AdjustTableBoundaries()
const ICOORD & bleft() const
void GrowTableToIncludeLines(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
void GetTableColumns(ColSegment_LIST *table_columns)
void LocateTables(ColPartitionGrid *grid, ColPartitionSet **columns, WidthCallback *width_cb, const FCOORD &reskew)
const ICOORD & tright() const
void SplitAndInsertFragmentedTextPartition(ColPartition *part)
void GroupColumnBlocks(ColSegment_LIST *current_segments, ColSegment_LIST *col_segments)
bool ConsecutiveBoxes(const TBOX &b1, const TBOX &b2)
void SetGlobalSpacings(ColPartitionGrid *grid)
bool left_to_right_language_
void GetTableRegions(ColSegment_LIST *table_columns, ColSegment_LIST *table_regions)
bool HasWideOrNoInterWordGap(ColPartition *part) const
void FilterParagraphEndings()
void InitializePartitions(ColPartitionSet **all_columns)
bool HasLeaderAdjacent(const ColPartition &part)
bool HLineBelongsToTable(const ColPartition &part, const TBOX &table_box)
int global_median_blob_width_
void GetColumnBlocks(ColPartitionSet **columns, ColSegment_LIST *col_segments)
void set_global_median_blob_width(int width)
void DisplayColSegmentGrid(ScrollView *win, ColSegmentGrid *grid, ScrollView::Color color)
void GridMergeColumnBlocks()
void MarkTablePartitions()
ColPartitionGrid leader_and_ruling_grid_
void MoveColSegmentsToGrid(ColSegment_LIST *segments, ColSegmentGrid *col_seg_grid)
void InsertLeaderPartition(ColPartition *part)
bool GapInXProjection(int *xprojection, int length)
void set_global_median_xheight(int xheight)
void GridMergeTableRegions()
ColSegmentGrid col_seg_grid_
void InsertCleanPartitions(ColPartitionGrid *grid, TO_BLOCK *block)
static void SetPartitionSpacings(ColPartitionGrid *grid, ColPartitionSet **all_columns)
void set_global_median_ledding(int ledding)
void InsertRulingPartition(ColPartition *part)
void set_left_to_right_language(bool order)
bool AllowBlob(const BLOBNBOX &blob) const
int global_median_xheight_
ColSegmentGrid table_grid_
void SetColumnsType(ColSegment_LIST *col_segments)
void GrowTableToIncludePartials(const TBOX &table_box, const TBOX &search_range, TBOX *result_box)
void DeleteSingleColumnTables()
ColPartitionGrid fragmented_text_grid_
void InsertTextPartition(ColPartition *part)
void SetVerticalSpacing(ColPartition *part)
void DisplayColPartitionConnections(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color default_color)
void SmoothTablePartitionRuns()
void DisplayColPartitions(ScrollView *win, ColPartitionGrid *grid, ScrollView::Color text_color, ScrollView::Color table_color)
void MarkPartitionsUsingLocalInformation()
ColPartitionGrid clean_part_grid_
void InsertImagePartition(ColPartition *part)
bool AllowTextPartition(const ColPartition &part) const
const TBOX & bounding_box() const
void Display(ScrollView *window, ScrollView::Color color)
void set_max_text_height(int height)
void set_line_grid(ColPartitionGrid *lines)
void set_text_grid(ColPartitionGrid *text)
void set_min_height(int height)
StructuredTable * RecognizeTable(const TBOX &guess_box)
void Line(int x1, int y1, int x2, int y2)
void Rectangle(int x1, int y1, int x2, int y2)