Skip to content

Commit 28846b3

Browse files
authored
Moved JSON export code from JavaScript to C++ (#984)
1 parent 6cf46c1 commit 28846b3

File tree

5 files changed

+37
-161
lines changed

5 files changed

+37
-161
lines changed

package-lock.json

Lines changed: 8 additions & 7 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -68,7 +68,7 @@
6868
"node-fetch": "^2.6.9",
6969
"opencollective-postinstall": "^2.0.3",
7070
"regenerator-runtime": "^0.13.3",
71-
"tesseract.js-core": "^5.1.1",
71+
"tesseract.js-core": "^6.0.0-0",
7272
"wasm-feature-detect": "^1.2.11",
7373
"zlibjs": "^0.3.1"
7474
},

src/worker-script/utils/dump.js

Lines changed: 3 additions & 153 deletions
Original file line number | Diff line number | Diff line change
@@ -40,21 +40,6 @@ const deindent = (html) => {
4040
* @access public
4141
*/
4242
module.exports = (TessModule, api, output, options) => {
43-
const ri = api.GetIterator();
44-
const {
45-
RIL_BLOCK,
46-
RIL_PARA,
47-
RIL_TEXTLINE,
48-
RIL_WORD,
49-
RIL_SYMBOL,
50-
} = TessModule;
51-
const blocks = [];
52-
let block;
53-
let para;
54-
let textline;
55-
let word;
56-
let symbol;
57-
5843
const enumToString = (value, prefix) => (
5944
Object.keys(TessModule)
6045
.filter((e) => (e.startsWith(`${prefix}_`) && TessModule[e] === value))
@@ -79,142 +64,6 @@ module.exports = (TessModule, api, output, options) => {
7964
return TessModule.FS.readFile('/tesseract-ocr.pdf');
8065
};
8166

82-
// If output.layoutBlocks is true and options.skipRecognition is true,
83-
// the user wants layout data but text recognition has not been run.
84-
// In this case, fields that require text recognition are skipped.
85-
if (output.blocks || output.layoutBlocks) {
86-
ri.Begin();
87-
do {
88-
if (ri.IsAtBeginningOf(RIL_BLOCK)) {
89-
const poly = ri.BlockPolygon();
90-
let polygon = null;
91-
// BlockPolygon() returns null when automatic page segmentation is off
92-
if (TessModule.getPointer(poly) > 0) {
93-
const n = poly.get_n();
94-
const px = poly.get_x();
95-
const py = poly.get_y();
96-
polygon = [];
97-
for (let i = 0; i < n; i += 1) {
98-
polygon.push([px.getValue(i), py.getValue(i)]);
99-
}
100-
/*
101-
* TODO: find out why _ptaDestroy doesn't work
102-
*/
103-
// TessModule._ptaDestroy(TessModule.getPointer(poly));
104-
}
105-
106-
block = {
107-
paragraphs: [],
108-
text: !options.skipRecognition ? ri.GetUTF8Text(RIL_BLOCK) : null,
109-
confidence: !options.skipRecognition ? ri.Confidence(RIL_BLOCK) : null,
110-
baseline: ri.getBaseline(RIL_BLOCK),
111-
bbox: ri.getBoundingBox(RIL_BLOCK),
112-
blocktype: enumToString(ri.BlockType(), 'PT'),
113-
polygon,
114-
};
115-
blocks.push(block);
116-
}
117-
if (ri.IsAtBeginningOf(RIL_PARA)) {
118-
para = {
119-
lines: [],
120-
text: !options.skipRecognition ? ri.GetUTF8Text(RIL_PARA) : null,
121-
confidence: !options.skipRecognition ? ri.Confidence(RIL_PARA) : null,
122-
baseline: ri.getBaseline(RIL_PARA),
123-
bbox: ri.getBoundingBox(RIL_PARA),
124-
is_ltr: !!ri.ParagraphIsLtr(),
125-
};
126-
block.paragraphs.push(para);
127-
}
128-
if (ri.IsAtBeginningOf(RIL_TEXTLINE)) {
129-
// getRowAttributes was added in a recent minor version of Tesseract.js-core,
130-
// so we need to check if it exists before calling it.
131-
// This can be removed in the next major version (v6).
132-
let rowAttributes;
133-
if (ri.getRowAttributes) {
134-
rowAttributes = ri.getRowAttributes();
135-
// Descenders is reported as a negative within Tesseract internally so we need to flip it.
136-
// The positive version is intuitive, and matches what is reported in the hOCR output.
137-
rowAttributes.descenders *= -1;
138-
}
139-
textline = {
140-
words: [],
141-
text: !options.skipRecognition ? ri.GetUTF8Text(RIL_TEXTLINE) : null,
142-
confidence: !options.skipRecognition ? ri.Confidence(RIL_TEXTLINE) : null,
143-
baseline: ri.getBaseline(RIL_TEXTLINE),
144-
rowAttributes,
145-
bbox: ri.getBoundingBox(RIL_TEXTLINE),
146-
};
147-
para.lines.push(textline);
148-
}
149-
if (ri.IsAtBeginningOf(RIL_WORD)) {
150-
const fontInfo = ri.getWordFontAttributes();
151-
const wordDir = ri.WordDirection();
152-
word = {
153-
symbols: [],
154-
choices: [],
155-
156-
text: !options.skipRecognition ? ri.GetUTF8Text(RIL_WORD) : null,
157-
confidence: !options.skipRecognition ? ri.Confidence(RIL_WORD) : null,
158-
baseline: ri.getBaseline(RIL_WORD),
159-
bbox: ri.getBoundingBox(RIL_WORD),
160-
161-
is_numeric: !!ri.WordIsNumeric(),
162-
in_dictionary: !!ri.WordIsFromDictionary(),
163-
direction: enumToString(wordDir, 'DIR'),
164-
language: ri.WordRecognitionLanguage(),
165-
166-
is_bold: fontInfo.is_bold,
167-
is_italic: fontInfo.is_italic,
168-
is_underlined: fontInfo.is_underlined,
169-
is_monospace: fontInfo.is_monospace,
170-
is_serif: fontInfo.is_serif,
171-
is_smallcaps: fontInfo.is_smallcaps,
172-
font_size: fontInfo.pointsize,
173-
font_id: fontInfo.font_id,
174-
font_name: fontInfo.font_name,
175-
};
176-
const wc = new TessModule.WordChoiceIterator(ri);
177-
do {
178-
word.choices.push({
179-
text: !options.skipRecognition ? wc.GetUTF8Text() : null,
180-
confidence: !options.skipRecognition ? wc.Confidence() : null,
181-
});
182-
} while (wc.Next());
183-
TessModule.destroy(wc);
184-
textline.words.push(word);
185-
}
186-
187-
// let image = null;
188-
// var pix = ri.GetBinaryImage(TessModule.RIL_SYMBOL)
189-
// var image = pix2array(pix);
190-
// // for some reason it seems that things stop working if you destroy pics
191-
// TessModule._pixDestroy(TessModule.getPointer(pix));
192-
if (ri.IsAtBeginningOf(RIL_SYMBOL)) {
193-
symbol = {
194-
choices: [],
195-
image: null,
196-
text: !options.skipRecognition ? ri.GetUTF8Text(RIL_SYMBOL) : null,
197-
confidence: !options.skipRecognition ? ri.Confidence(RIL_SYMBOL) : null,
198-
baseline: ri.getBaseline(RIL_SYMBOL),
199-
bbox: ri.getBoundingBox(RIL_SYMBOL),
200-
is_superscript: !!ri.SymbolIsSuperscript(),
201-
is_subscript: !!ri.SymbolIsSubscript(),
202-
is_dropcap: !!ri.SymbolIsDropcap(),
203-
};
204-
word.symbols.push(symbol);
205-
const ci = new TessModule.ChoiceIterator(ri);
206-
do {
207-
symbol.choices.push({
208-
text: !options.skipRecognition ? ci.GetUTF8Text() : null,
209-
confidence: !options.skipRecognition ? ci.Confidence() : null,
210-
});
211-
} while (ci.Next());
212-
// TessModule.destroy(i);
213-
}
214-
} while (ri.Next(RIL_SYMBOL));
215-
TessModule.destroy(ri);
216-
}
217-
21867
return {
21968
text: output.text ? api.GetUTF8Text() : null,
22069
hocr: output.hocr ? deindent(api.GetHOCRText()) : null,
@@ -227,8 +76,9 @@ module.exports = (TessModule, api, output, options) => {
22776
imageGrey: output.imageGrey ? getImage(imageType.GREY) : null,
22877
imageBinary: output.imageBinary ? getImage(imageType.BINARY) : null,
22978
confidence: !options.skipRecognition ? api.MeanTextConf() : null,
230-
blocks: output.blocks && !options.skipRecognition ? blocks : null,
231-
layoutBlocks: output.layoutBlocks && options.skipRecognition ? blocks : null,
79+
blocks: output.blocks && !options.skipRecognition ? JSON.parse(api.GetJSONText()).blocks : null,
80+
layoutBlocks: output.layoutBlocks && options.skipRecognition
81+
? JSON.parse(api.GetJSONText()).blocks : null,
23282
psm: enumToString(api.GetPageSegMode(), 'PSM'),
23383
oem: enumToString(api.oem(), 'OEM'),
23484
version: api.Version(),

tests/assets/images/escape_chars.png

10.7 KB
Loading

tests/recognize.test.js

Lines changed: 25 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -269,4 +269,29 @@ describe('recognize()', () => {
269269
}).timeout(TIMEOUT)
270270
));
271271
});
272+
273+
describe('should support blocks (json) output', () => {
274+
it('recongize large image', async () => {
275+
await worker.reinitialize('eng');
276+
const { data: { blocks } } = await worker.recognize(`${IMAGE_PATH}/testocr.png`, {}, { blocks: true });
277+
expect(blocks[0].paragraphs[0].lines[0].words[0].symbols[0].text).to.be('T');
278+
expect(blocks[0].paragraphs[0].lines[0].words[0].text).to.be('This');
279+
expect(blocks[0].paragraphs[0].lines[0].text).to.be('This is a lot of 12 point text to test the\n');
280+
}).timeout(TIMEOUT);
281+
282+
it('recongize image with special characters', async () => {
283+
await worker.reinitialize('eng');
284+
const { data: { blocks } } = await worker.recognize(`${IMAGE_PATH}/escape_chars.png`, {}, { blocks: true });
285+
expect(blocks[0].paragraphs[0].lines[0].text).to.be('"Double Quotes"\n');
286+
expect(blocks[0].paragraphs[0].lines[1].text).to.be('Back \\ Slash\n');
287+
}).timeout(TIMEOUT);
288+
289+
it('recongize chinese image', async () => {
290+
await worker.reinitialize('chi_tra');
291+
const { data: { blocks } } = await worker.recognize(`${IMAGE_PATH}/chinese.png`, {}, { blocks: true });
292+
expect(blocks[0].paragraphs[0].lines[0].words[0].symbols[0].text).to.be('繁');
293+
expect(blocks[0].paragraphs[0].lines[0].words[0].text).to.be('繁體');
294+
expect(blocks[0].paragraphs[0].lines[0].text).to.be('繁體 中 文 測試\n');
295+
}).timeout(TIMEOUT);
296+
});
272297
});

0 commit comments

Comments
 (0)