@@ -40,21 +40,6 @@ const deindent = (html) => {
40
40
* @access public
41
41
*/
42
42
module . exports = ( TessModule , api , output , options ) => {
43
- const ri = api . GetIterator ( ) ;
44
- const {
45
- RIL_BLOCK ,
46
- RIL_PARA ,
47
- RIL_TEXTLINE ,
48
- RIL_WORD ,
49
- RIL_SYMBOL ,
50
- } = TessModule ;
51
- const blocks = [ ] ;
52
- let block ;
53
- let para ;
54
- let textline ;
55
- let word ;
56
- let symbol ;
57
-
58
43
const enumToString = ( value , prefix ) => (
59
44
Object . keys ( TessModule )
60
45
. filter ( ( e ) => ( e . startsWith ( `${ prefix } _` ) && TessModule [ e ] === value ) )
@@ -79,142 +64,6 @@ module.exports = (TessModule, api, output, options) => {
79
64
return TessModule . FS . readFile ( '/tesseract-ocr.pdf' ) ;
80
65
} ;
81
66
82
- // If output.layoutBlocks is true and options.skipRecognition is true,
83
- // the user wants layout data but text recognition has not been run.
84
- // In this case, fields that require text recognition are skipped.
85
- if ( output . blocks || output . layoutBlocks ) {
86
- ri . Begin ( ) ;
87
- do {
88
- if ( ri . IsAtBeginningOf ( RIL_BLOCK ) ) {
89
- const poly = ri . BlockPolygon ( ) ;
90
- let polygon = null ;
91
- // BlockPolygon() returns null when automatic page segmentation is off
92
- if ( TessModule . getPointer ( poly ) > 0 ) {
93
- const n = poly . get_n ( ) ;
94
- const px = poly . get_x ( ) ;
95
- const py = poly . get_y ( ) ;
96
- polygon = [ ] ;
97
- for ( let i = 0 ; i < n ; i += 1 ) {
98
- polygon . push ( [ px . getValue ( i ) , py . getValue ( i ) ] ) ;
99
- }
100
- /*
101
- * TODO: find out why _ptaDestroy doesn't work
102
- */
103
- // TessModule._ptaDestroy(TessModule.getPointer(poly));
104
- }
105
-
106
- block = {
107
- paragraphs : [ ] ,
108
- text : ! options . skipRecognition ? ri . GetUTF8Text ( RIL_BLOCK ) : null ,
109
- confidence : ! options . skipRecognition ? ri . Confidence ( RIL_BLOCK ) : null ,
110
- baseline : ri . getBaseline ( RIL_BLOCK ) ,
111
- bbox : ri . getBoundingBox ( RIL_BLOCK ) ,
112
- blocktype : enumToString ( ri . BlockType ( ) , 'PT' ) ,
113
- polygon,
114
- } ;
115
- blocks . push ( block ) ;
116
- }
117
- if ( ri . IsAtBeginningOf ( RIL_PARA ) ) {
118
- para = {
119
- lines : [ ] ,
120
- text : ! options . skipRecognition ? ri . GetUTF8Text ( RIL_PARA ) : null ,
121
- confidence : ! options . skipRecognition ? ri . Confidence ( RIL_PARA ) : null ,
122
- baseline : ri . getBaseline ( RIL_PARA ) ,
123
- bbox : ri . getBoundingBox ( RIL_PARA ) ,
124
- is_ltr : ! ! ri . ParagraphIsLtr ( ) ,
125
- } ;
126
- block . paragraphs . push ( para ) ;
127
- }
128
- if ( ri . IsAtBeginningOf ( RIL_TEXTLINE ) ) {
129
- // getRowAttributes was added in a recent minor version of Tesseract.js-core,
130
- // so we need to check if it exists before calling it.
131
- // This can be removed in the next major version (v6).
132
- let rowAttributes ;
133
- if ( ri . getRowAttributes ) {
134
- rowAttributes = ri . getRowAttributes ( ) ;
135
- // Descenders is reported as a negative within Tesseract internally so we need to flip it.
136
- // The positive version is intuitive, and matches what is reported in the hOCR output.
137
- rowAttributes . descenders *= - 1 ;
138
- }
139
- textline = {
140
- words : [ ] ,
141
- text : ! options . skipRecognition ? ri . GetUTF8Text ( RIL_TEXTLINE ) : null ,
142
- confidence : ! options . skipRecognition ? ri . Confidence ( RIL_TEXTLINE ) : null ,
143
- baseline : ri . getBaseline ( RIL_TEXTLINE ) ,
144
- rowAttributes,
145
- bbox : ri . getBoundingBox ( RIL_TEXTLINE ) ,
146
- } ;
147
- para . lines . push ( textline ) ;
148
- }
149
- if ( ri . IsAtBeginningOf ( RIL_WORD ) ) {
150
- const fontInfo = ri . getWordFontAttributes ( ) ;
151
- const wordDir = ri . WordDirection ( ) ;
152
- word = {
153
- symbols : [ ] ,
154
- choices : [ ] ,
155
-
156
- text : ! options . skipRecognition ? ri . GetUTF8Text ( RIL_WORD ) : null ,
157
- confidence : ! options . skipRecognition ? ri . Confidence ( RIL_WORD ) : null ,
158
- baseline : ri . getBaseline ( RIL_WORD ) ,
159
- bbox : ri . getBoundingBox ( RIL_WORD ) ,
160
-
161
- is_numeric : ! ! ri . WordIsNumeric ( ) ,
162
- in_dictionary : ! ! ri . WordIsFromDictionary ( ) ,
163
- direction : enumToString ( wordDir , 'DIR' ) ,
164
- language : ri . WordRecognitionLanguage ( ) ,
165
-
166
- is_bold : fontInfo . is_bold ,
167
- is_italic : fontInfo . is_italic ,
168
- is_underlined : fontInfo . is_underlined ,
169
- is_monospace : fontInfo . is_monospace ,
170
- is_serif : fontInfo . is_serif ,
171
- is_smallcaps : fontInfo . is_smallcaps ,
172
- font_size : fontInfo . pointsize ,
173
- font_id : fontInfo . font_id ,
174
- font_name : fontInfo . font_name ,
175
- } ;
176
- const wc = new TessModule . WordChoiceIterator ( ri ) ;
177
- do {
178
- word . choices . push ( {
179
- text : ! options . skipRecognition ? wc . GetUTF8Text ( ) : null ,
180
- confidence : ! options . skipRecognition ? wc . Confidence ( ) : null ,
181
- } ) ;
182
- } while ( wc . Next ( ) ) ;
183
- TessModule . destroy ( wc ) ;
184
- textline . words . push ( word ) ;
185
- }
186
-
187
- // let image = null;
188
- // var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL)
189
- // var image = pix2array(pix);
190
- // // for some reason it seems that things stop working if you destroy pics
191
- // TessModule._pixDestroy(TessModule.getPointer(pix));
192
- if ( ri . IsAtBeginningOf ( RIL_SYMBOL ) ) {
193
- symbol = {
194
- choices : [ ] ,
195
- image : null ,
196
- text : ! options . skipRecognition ? ri . GetUTF8Text ( RIL_SYMBOL ) : null ,
197
- confidence : ! options . skipRecognition ? ri . Confidence ( RIL_SYMBOL ) : null ,
198
- baseline : ri . getBaseline ( RIL_SYMBOL ) ,
199
- bbox : ri . getBoundingBox ( RIL_SYMBOL ) ,
200
- is_superscript : ! ! ri . SymbolIsSuperscript ( ) ,
201
- is_subscript : ! ! ri . SymbolIsSubscript ( ) ,
202
- is_dropcap : ! ! ri . SymbolIsDropcap ( ) ,
203
- } ;
204
- word . symbols . push ( symbol ) ;
205
- const ci = new TessModule . ChoiceIterator ( ri ) ;
206
- do {
207
- symbol . choices . push ( {
208
- text : ! options . skipRecognition ? ci . GetUTF8Text ( ) : null ,
209
- confidence : ! options . skipRecognition ? ci . Confidence ( ) : null ,
210
- } ) ;
211
- } while ( ci . Next ( ) ) ;
212
- // TessModule.destroy(i);
213
- }
214
- } while ( ri . Next ( RIL_SYMBOL ) ) ;
215
- TessModule . destroy ( ri ) ;
216
- }
217
-
218
67
return {
219
68
text : output . text ? api . GetUTF8Text ( ) : null ,
220
69
hocr : output . hocr ? deindent ( api . GetHOCRText ( ) ) : null ,
@@ -227,8 +76,9 @@ module.exports = (TessModule, api, output, options) => {
227
76
imageGrey : output . imageGrey ? getImage ( imageType . GREY ) : null ,
228
77
imageBinary : output . imageBinary ? getImage ( imageType . BINARY ) : null ,
229
78
confidence : ! options . skipRecognition ? api . MeanTextConf ( ) : null ,
230
- blocks : output . blocks && ! options . skipRecognition ? blocks : null ,
231
- layoutBlocks : output . layoutBlocks && options . skipRecognition ? blocks : null ,
79
+ blocks : output . blocks && ! options . skipRecognition ? JSON . parse ( api . GetJSONText ( ) ) . blocks : null ,
80
+ layoutBlocks : output . layoutBlocks && options . skipRecognition
81
+ ? JSON . parse ( api . GetJSONText ( ) ) . blocks : null ,
232
82
psm : enumToString ( api . GetPageSegMode ( ) , 'PSM' ) ,
233
83
oem : enumToString ( api . oem ( ) , 'OEM' ) ,
234
84
version : api . Version ( ) ,
0 commit comments