@@ -40,21 +40,6 @@ const deindent = (html) => {
4040 * @access public
4141 */
4242module . exports = ( TessModule , api , output , options ) => {
43- const ri = api . GetIterator ( ) ;
44- const {
45- RIL_BLOCK ,
46- RIL_PARA ,
47- RIL_TEXTLINE ,
48- RIL_WORD ,
49- RIL_SYMBOL ,
50- } = TessModule ;
51- const blocks = [ ] ;
52- let block ;
53- let para ;
54- let textline ;
55- let word ;
56- let symbol ;
57-
5843 const enumToString = ( value , prefix ) => (
5944 Object . keys ( TessModule )
6045 . filter ( ( e ) => ( e . startsWith ( `${ prefix } _` ) && TessModule [ e ] === value ) )
@@ -79,142 +64,6 @@ module.exports = (TessModule, api, output, options) => {
7964 return TessModule . FS . readFile ( '/tesseract-ocr.pdf' ) ;
8065 } ;
8166
82- // If output.layoutBlocks is true and options.skipRecognition is true,
83- // the user wants layout data but text recognition has not been run.
84- // In this case, fields that require text recognition are skipped.
85- if ( output . blocks || output . layoutBlocks ) {
86- ri . Begin ( ) ;
87- do {
88- if ( ri . IsAtBeginningOf ( RIL_BLOCK ) ) {
89- const poly = ri . BlockPolygon ( ) ;
90- let polygon = null ;
91- // BlockPolygon() returns null when automatic page segmentation is off
92- if ( TessModule . getPointer ( poly ) > 0 ) {
93- const n = poly . get_n ( ) ;
94- const px = poly . get_x ( ) ;
95- const py = poly . get_y ( ) ;
96- polygon = [ ] ;
97- for ( let i = 0 ; i < n ; i += 1 ) {
98- polygon . push ( [ px . getValue ( i ) , py . getValue ( i ) ] ) ;
99- }
100- /*
101- * TODO: find out why _ptaDestroy doesn't work
102- */
103- // TessModule._ptaDestroy(TessModule.getPointer(poly));
104- }
105-
106- block = {
107- paragraphs : [ ] ,
108- text : ! options . skipRecognition ? ri . GetUTF8Text ( RIL_BLOCK ) : null ,
109- confidence : ! options . skipRecognition ? ri . Confidence ( RIL_BLOCK ) : null ,
110- baseline : ri . getBaseline ( RIL_BLOCK ) ,
111- bbox : ri . getBoundingBox ( RIL_BLOCK ) ,
112- blocktype : enumToString ( ri . BlockType ( ) , 'PT' ) ,
113- polygon,
114- } ;
115- blocks . push ( block ) ;
116- }
117- if ( ri . IsAtBeginningOf ( RIL_PARA ) ) {
118- para = {
119- lines : [ ] ,
120- text : ! options . skipRecognition ? ri . GetUTF8Text ( RIL_PARA ) : null ,
121- confidence : ! options . skipRecognition ? ri . Confidence ( RIL_PARA ) : null ,
122- baseline : ri . getBaseline ( RIL_PARA ) ,
123- bbox : ri . getBoundingBox ( RIL_PARA ) ,
124- is_ltr : ! ! ri . ParagraphIsLtr ( ) ,
125- } ;
126- block . paragraphs . push ( para ) ;
127- }
128- if ( ri . IsAtBeginningOf ( RIL_TEXTLINE ) ) {
129- // getRowAttributes was added in a recent minor version of Tesseract.js-core,
130- // so we need to check if it exists before calling it.
131- // This can be removed in the next major version (v6).
132- let rowAttributes ;
133- if ( ri . getRowAttributes ) {
134- rowAttributes = ri . getRowAttributes ( ) ;
135- // Descenders is reported as a negative within Tesseract internally so we need to flip it.
136- // The positive version is intuitive, and matches what is reported in the hOCR output.
137- rowAttributes . descenders *= - 1 ;
138- }
139- textline = {
140- words : [ ] ,
141- text : ! options . skipRecognition ? ri . GetUTF8Text ( RIL_TEXTLINE ) : null ,
142- confidence : ! options . skipRecognition ? ri . Confidence ( RIL_TEXTLINE ) : null ,
143- baseline : ri . getBaseline ( RIL_TEXTLINE ) ,
144- rowAttributes,
145- bbox : ri . getBoundingBox ( RIL_TEXTLINE ) ,
146- } ;
147- para . lines . push ( textline ) ;
148- }
149- if ( ri . IsAtBeginningOf ( RIL_WORD ) ) {
150- const fontInfo = ri . getWordFontAttributes ( ) ;
151- const wordDir = ri . WordDirection ( ) ;
152- word = {
153- symbols : [ ] ,
154- choices : [ ] ,
155-
156- text : ! options . skipRecognition ? ri . GetUTF8Text ( RIL_WORD ) : null ,
157- confidence : ! options . skipRecognition ? ri . Confidence ( RIL_WORD ) : null ,
158- baseline : ri . getBaseline ( RIL_WORD ) ,
159- bbox : ri . getBoundingBox ( RIL_WORD ) ,
160-
161- is_numeric : ! ! ri . WordIsNumeric ( ) ,
162- in_dictionary : ! ! ri . WordIsFromDictionary ( ) ,
163- direction : enumToString ( wordDir , 'DIR' ) ,
164- language : ri . WordRecognitionLanguage ( ) ,
165-
166- is_bold : fontInfo . is_bold ,
167- is_italic : fontInfo . is_italic ,
168- is_underlined : fontInfo . is_underlined ,
169- is_monospace : fontInfo . is_monospace ,
170- is_serif : fontInfo . is_serif ,
171- is_smallcaps : fontInfo . is_smallcaps ,
172- font_size : fontInfo . pointsize ,
173- font_id : fontInfo . font_id ,
174- font_name : fontInfo . font_name ,
175- } ;
176- const wc = new TessModule . WordChoiceIterator ( ri ) ;
177- do {
178- word . choices . push ( {
179- text : ! options . skipRecognition ? wc . GetUTF8Text ( ) : null ,
180- confidence : ! options . skipRecognition ? wc . Confidence ( ) : null ,
181- } ) ;
182- } while ( wc . Next ( ) ) ;
183- TessModule . destroy ( wc ) ;
184- textline . words . push ( word ) ;
185- }
186-
187- // let image = null;
188- // var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL)
189- // var image = pix2array(pix);
190- // // for some reason it seems that things stop working if you destroy pics
191- // TessModule._pixDestroy(TessModule.getPointer(pix));
192- if ( ri . IsAtBeginningOf ( RIL_SYMBOL ) ) {
193- symbol = {
194- choices : [ ] ,
195- image : null ,
196- text : ! options . skipRecognition ? ri . GetUTF8Text ( RIL_SYMBOL ) : null ,
197- confidence : ! options . skipRecognition ? ri . Confidence ( RIL_SYMBOL ) : null ,
198- baseline : ri . getBaseline ( RIL_SYMBOL ) ,
199- bbox : ri . getBoundingBox ( RIL_SYMBOL ) ,
200- is_superscript : ! ! ri . SymbolIsSuperscript ( ) ,
201- is_subscript : ! ! ri . SymbolIsSubscript ( ) ,
202- is_dropcap : ! ! ri . SymbolIsDropcap ( ) ,
203- } ;
204- word . symbols . push ( symbol ) ;
205- const ci = new TessModule . ChoiceIterator ( ri ) ;
206- do {
207- symbol . choices . push ( {
208- text : ! options . skipRecognition ? ci . GetUTF8Text ( ) : null ,
209- confidence : ! options . skipRecognition ? ci . Confidence ( ) : null ,
210- } ) ;
211- } while ( ci . Next ( ) ) ;
212- // TessModule.destroy(i);
213- }
214- } while ( ri . Next ( RIL_SYMBOL ) ) ;
215- TessModule . destroy ( ri ) ;
216- }
217-
21867 return {
21968 text : output . text ? api . GetUTF8Text ( ) : null ,
22069 hocr : output . hocr ? deindent ( api . GetHOCRText ( ) ) : null ,
@@ -227,8 +76,9 @@ module.exports = (TessModule, api, output, options) => {
22776 imageGrey : output . imageGrey ? getImage ( imageType . GREY ) : null ,
22877 imageBinary : output . imageBinary ? getImage ( imageType . BINARY ) : null ,
22978 confidence : ! options . skipRecognition ? api . MeanTextConf ( ) : null ,
230- blocks : output . blocks && ! options . skipRecognition ? blocks : null ,
231- layoutBlocks : output . layoutBlocks && options . skipRecognition ? blocks : null ,
79+ blocks : output . blocks && ! options . skipRecognition ? JSON . parse ( api . GetJSONText ( ) ) . blocks : null ,
80+ layoutBlocks : output . layoutBlocks && options . skipRecognition
81+ ? JSON . parse ( api . GetJSONText ( ) ) . blocks : null ,
23282 psm : enumToString ( api . GetPageSegMode ( ) , 'PSM' ) ,
23383 oem : enumToString ( api . oem ( ) , 'OEM' ) ,
23484 version : api . Version ( ) ,
0 commit comments