tesseract
4.1.1
cluster.h
Go to the documentation of this file.
1
/******************************************************************************
 ** Filename:    cluster.h
 ** Purpose:     Definition of feature space clustering routines
 ** Author:      Dan Johnson
 **
 ** (c) Copyright Hewlett-Packard Company, 1988.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *****************************************************************************/
17
18
#ifndef CLUSTER_H
19
#define CLUSTER_H
20
21
#include "
kdtree.h
"
22
#include "
oldlist.h
"
23
24
// Forward declaration only: this header just stores pointers to BUCKETS
// (see CLUSTERER::bucket_cache), so the full type is not needed here.
struct BUCKETS;

// Smallest and largest bucket counts covered by CLUSTERER::bucket_cache,
// which holds MAXBUCKETS + 1 - MINBUCKETS entries per distribution type.
#define MINBUCKETS 5
#define MAXBUCKETS 39
28
29
/*----------------------------------------------------------------------
30
Types
31
----------------------------------------------------------------------*/
32
// A cluster in the cluster tree. The same structure is also used for an
// individual sample (the file aliases SAMPLE to CLUSTER).
typedef struct sample {
  bool Clustered : 1;         // true if included in a higher cluster
  bool Prototype : 1;         // true if cluster represented by a proto
  unsigned SampleCount : 30;  // number of samples in this cluster
  struct sample* Left;        // ptr to left sub-cluster
  struct sample* Right;       // ptr to right sub-cluster
  int32_t CharID;             // identifier of char sample came from
  // Mean of cluster - SampleSize floats. Only one element is declared here,
  // so storage for the remaining floats is presumably over-allocated by
  // whoever creates the node -- TODO confirm against the allocation site.
  float Mean[1];
} CLUSTER;
41
42
// A sample and a cluster share one representation, so either name may be
// used for the same type.
using SAMPLE = CLUSTER;
43
44
// Kinds of prototype that clustering can be asked to make
// (see CLUSTERCONFIG::ProtoStyle).
typedef enum {
  spherical = 0,
  elliptical,
  mixed,
  automatic
} PROTOSTYLE;
45
46
typedef
struct
{
// parameters to control clustering
47
PROTOSTYLE
ProtoStyle
;
// specifies types of protos to be made
48
float
MinSamples
;
// min # of samples per proto - % of total
49
float
MaxIllegal
;
// max percentage of samples in a cluster which
50
// have more than 1 feature in that cluster
51
float
Independence
;
// desired independence between dimensions
52
double
Confidence
;
// desired confidence in prototypes created
53
int
MagicSamples
;
// Ideal number of samples in a cluster.
54
}
CLUSTERCONFIG
;
55
56
// Distribution types. DISTRIBUTION_COUNT is kept last so that it equals the
// number of real distribution types; it sizes the first dimension of
// CLUSTERER::bucket_cache.
typedef enum {
  normal = 0,
  uniform,
  D_random,
  DISTRIBUTION_COUNT
} DISTRIBUTION;
57
58
// A value stored either as one float or as a pointer to floats. The two
// members share storage; which one is valid is presumably decided by the
// owning prototype's Style -- TODO confirm against the users of PROTOTYPE.
typedef union {
  float Spherical;
  float* Elliptical;
} FLOATUNION;
62
63
typedef
struct
{
64
bool
Significant
: 1;
// true if prototype is significant
65
bool
Merged
: 1;
// Merged after clustering so do not output
66
// but kept for display purposes. If it has no
67
// samples then it was actually merged.
68
// Otherwise it matched an already significant
69
// cluster.
70
unsigned
Style
: 2;
// spherical, elliptical, or mixed
71
unsigned
NumSamples
: 28;
// number of samples in the cluster
72
CLUSTER
*
Cluster
;
// ptr to cluster which made prototype
73
DISTRIBUTION
*
Distrib
;
// different distribution for each dimension
74
float
*
Mean
;
// prototype mean
75
float
TotalMagnitude
;
// total magnitude over all dimensions
76
float
LogMagnitude
;
// log base e of TotalMagnitude
77
FLOATUNION
Variance
;
// prototype variance
78
FLOATUNION
Magnitude
;
// magnitude of density function
79
FLOATUNION
Weight
;
// weight of density function
80
}
PROTOTYPE
;
81
82
typedef
struct
{
83
int16_t
SampleSize
;
// number of parameters per sample
84
PARAM_DESC
*
ParamDesc
;
// description of each parameter
85
int32_t
NumberOfSamples
;
// total number of samples being clustered
86
KDTREE
*
KDTree
;
// for optimal nearest neighbor searching
87
CLUSTER
*
Root
;
// ptr to root cluster of cluster tree
88
LIST
ProtoList
;
// list of prototypes
89
int32_t
NumChar
;
// # of characters represented by samples
90
// cache of reusable histograms by distribution type and number of buckets.
91
BUCKETS
* bucket_cache[
DISTRIBUTION_COUNT
][
MAXBUCKETS
+ 1 -
MINBUCKETS
];
92
}
CLUSTERER
;
93
94
typedef
struct
{
95
int32_t
NumSamples
;
// number of samples in list
96
int32_t
MaxNumSamples
;
// maximum size of list
97
SAMPLE
* Sample[1];
// array of ptrs to sample data structures
98
}
SAMPLELIST
;
99
100
// low level cluster tree analysis routines.

// Initializes the search state S for a walk starting at cluster C:
// S becomes NIL_LIST when C is null, otherwise push(NIL_LIST, C).
// The macro's value is the value assigned to S. C is evaluated twice when it
// is non-null, so pass an expression without side effects.
#define InitSampleSearch(S, C) \
  (((C) == nullptr) ? ((S) = NIL_LIST) : ((S) = push(NIL_LIST, (C))))
103
104
/*--------------------------------------------------------------------------
105
Public Function Prototypes
106
--------------------------------------------------------------------------*/
107
CLUSTERER
*
MakeClusterer
(int16_t SampleSize,
const
PARAM_DESC
ParamDesc[]);
108
109
SAMPLE
*
MakeSample
(
CLUSTERER
* Clusterer,
const
float
* Feature, int32_t CharID);
110
111
LIST
ClusterSamples
(
CLUSTERER
* Clusterer,
CLUSTERCONFIG
*
Config
);
112
113
void
FreeClusterer
(
CLUSTERER
* Clusterer);
114
115
void
FreeProtoList
(
LIST
* ProtoList);
116
117
void
FreePrototype
(
void
* arg);
// PROTOTYPE *Prototype);
118
119
CLUSTER
*
NextSample
(
LIST
* SearchState);
120
121
float
Mean
(
PROTOTYPE
* Proto, uint16_t Dimension);
122
123
float
StandardDeviation
(
PROTOTYPE
* Proto, uint16_t Dimension);
124
125
int32_t
MergeClusters
(int16_t N,
PARAM_DESC
ParamDesc[], int32_t n1, int32_t n2,
126
float
m[],
float
m1[],
float
m2[]);
127
128
#endif
KDTREE
Definition:
kdtree.h:48
SAMPLELIST
Definition:
cluster.h:94
CLUSTERER::ProtoList
LIST ProtoList
Definition:
cluster.h:88
sample::Clustered
bool Clustered
Definition:
cluster.h:33
PROTOTYPE::Distrib
DISTRIBUTION * Distrib
Definition:
cluster.h:73
FLOATUNION
Definition:
cluster.h:58
PROTOTYPE::TotalMagnitude
float TotalMagnitude
Definition:
cluster.h:75
CLUSTERCONFIG::MagicSamples
int MagicSamples
Definition:
cluster.h:53
FLOATUNION::Elliptical
float * Elliptical
Definition:
cluster.h:60
CLUSTERCONFIG::MaxIllegal
float MaxIllegal
Definition:
cluster.h:49
PROTOTYPE::LogMagnitude
float LogMagnitude
Definition:
cluster.h:76
PROTOTYPE::NumSamples
unsigned NumSamples
Definition:
cluster.h:71
mixed
@ mixed
Definition:
cluster.h:44
PROTOTYPE::Magnitude
FLOATUNION Magnitude
Definition:
cluster.h:78
CLUSTERCONFIG
Definition:
cluster.h:46
FLOATUNION::Spherical
float Spherical
Definition:
cluster.h:59
sample::Mean
float Mean[1]
Definition:
cluster.h:39
CLUSTERER::NumberOfSamples
int32_t NumberOfSamples
Definition:
cluster.h:85
sample::Left
struct sample * Left
Definition:
cluster.h:36
PROTOTYPE::Cluster
CLUSTER * Cluster
Definition:
cluster.h:72
spherical
@ spherical
Definition:
cluster.h:44
MINBUCKETS
#define MINBUCKETS
Definition:
cluster.h:26
CLUSTERCONFIG::ProtoStyle
PROTOSTYLE ProtoStyle
Definition:
cluster.h:47
MergeClusters
int32_t MergeClusters(int16_t N, PARAM_DESC ParamDesc[], int32_t n1, int32_t n2, float m[], float m1[], float m2[])
Definition:
cluster.cpp:824
sample
Definition:
cluster.h:32
sample::SampleCount
unsigned SampleCount
Definition:
cluster.h:35
BUCKETS
Definition:
cluster.cpp:179
kdtree.h
StandardDeviation
float StandardDeviation(PROTOTYPE *Proto, uint16_t Dimension)
Definition:
cluster.cpp:613
FreeProtoList
void FreeProtoList(LIST *ProtoList)
Definition:
cluster.cpp:538
CLUSTERCONFIG::MinSamples
float MinSamples
Definition:
cluster.h:48
sample::Right
struct sample * Right
Definition:
cluster.h:37
sample::Prototype
bool Prototype
Definition:
cluster.h:34
CLUSTER
struct sample CLUSTER
Mean
float Mean(PROTOTYPE *Proto, uint16_t Dimension)
Definition:
cluster.cpp:602
CLUSTERER::SampleSize
int16_t SampleSize
Definition:
cluster.h:83
DISTRIBUTION_COUNT
@ DISTRIBUTION_COUNT
Definition:
cluster.h:56
CLUSTERER::KDTree
KDTREE * KDTree
Definition:
cluster.h:86
NextSample
CLUSTER * NextSample(LIST *SearchState)
Definition:
cluster.cpp:580
MakeClusterer
CLUSTERER * MakeClusterer(int16_t SampleSize, const PARAM_DESC ParamDesc[])
Definition:
cluster.cpp:376
ClusterSamples
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition:
cluster.cpp:483
SAMPLELIST::MaxNumSamples
int32_t MaxNumSamples
Definition:
cluster.h:96
FreeClusterer
void FreeClusterer(CLUSTERER *Clusterer)
Definition:
cluster.cpp:514
CLUSTERER
Definition:
cluster.h:82
automatic
@ automatic
Definition:
cluster.h:44
D_random
@ D_random
Definition:
cluster.h:56
CLUSTERCONFIG::Confidence
double Confidence
Definition:
cluster.h:52
PROTOTYPE::Variance
FLOATUNION Variance
Definition:
cluster.h:77
MakeSample
SAMPLE * MakeSample(CLUSTERER *Clusterer, const float *Feature, int32_t CharID)
Definition:
cluster.cpp:429
FreePrototype
void FreePrototype(void *arg)
Definition:
cluster.cpp:549
CLUSTERER::ParamDesc
PARAM_DESC * ParamDesc
Definition:
cluster.h:84
CLUSTERER::NumChar
int32_t NumChar
Definition:
cluster.h:89
PROTOTYPE::Significant
bool Significant
Definition:
cluster.h:64
normal
@ normal
Definition:
cluster.h:56
sample::CharID
int32_t CharID
Definition:
cluster.h:38
PROTOTYPE::Mean
float * Mean
Definition:
cluster.h:74
PARAM_DESC
Definition:
ocrfeatures.h:42
elliptical
@ elliptical
Definition:
cluster.h:44
PROTOTYPE::Weight
FLOATUNION Weight
Definition:
cluster.h:79
PROTOTYPE::Merged
bool Merged
Definition:
cluster.h:65
uniform
@ uniform
Definition:
cluster.h:56
PROTOTYPE
Definition:
cluster.h:63
CLUSTERER::Root
CLUSTER * Root
Definition:
cluster.h:87
list_rec
Definition:
oldlist.h:81
SAMPLELIST::NumSamples
int32_t NumSamples
Definition:
cluster.h:95
oldlist.h
DISTRIBUTION
DISTRIBUTION
Definition:
cluster.h:56
MAXBUCKETS
#define MAXBUCKETS
Definition:
cluster.h:27
PROTOSTYLE
PROTOSTYLE
Definition:
cluster.h:44
PROTOTYPE::Style
unsigned Style
Definition:
cluster.h:70
CLUSTERCONFIG::Independence
float Independence
Definition:
cluster.h:51
Config
CLUSTERCONFIG Config
Definition:
commontraining.cpp:88
src
classify
cluster.h
Generated on Thu Mar 26 2020 00:00:00 for tesseract by
1.8.18