$extrastylesheet
Olena  User documentation 2.1
An Image Processing Platform
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
recognition.hh
1 // Copyright (C) 2009, 2010, 2011, 2013 EPITA Research and Development
2 // Laboratory (LRDE)
3 //
4 // This file is part of Olena.
5 //
6 // Olena is free software: you can redistribute it and/or modify it under
7 // the terms of the GNU General Public License as published by the Free
8 // Software Foundation, version 2 of the License.
9 //
10 // Olena is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 // General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License
16 // along with Olena. If not, see <http://www.gnu.org/licenses/>.
17 //
18 // As a special exception, you may use this file as part of a free
19 // software project without restriction. Specifically, if other files
20 // instantiate templates or use macros or inline functions from this
21 // file, or you compile this file and link it with other files to produce
22 // an executable, this file does not by itself cause the resulting
23 // executable to be covered by the GNU General Public License. This
24 // exception does not however invalidate any other reasons why the
25 // executable file might be covered by the GNU General Public License.
26 
27 #ifndef SCRIBO_TEXT_RECOGNITION_HH
28 # define SCRIBO_TEXT_RECOGNITION_HH
29 
36 
37 # if !defined HAVE_TESSERACT_2 && !defined HAVE_TESSERACT_3
38 # define HAVE_TESSERACT_2
39 # endif
40 
41 # include <clocale>
42 
43 # include <ostream>
44 
45 # include <mln/core/image/dmorph/image_if.hh>
46 # include <mln/core/concept/neighborhood.hh>
47 # include <mln/core/site_set/box.hh>
48 
49 # include <mln/util/array.hh>
50 # include <mln/data/fill.hh>
51 # include <mln/data/paste.hh>
52 # include <mln/data/paste_without_localization.hh>
53 # include <mln/pw/all.hh>
54 
55 # include <mln/core/alias/w_window2d_int.hh>
56 # include <mln/make/w_window2d_int.hh>
57 
58 # include <mln/border/resize.hh>
59 
60 # include <scribo/core/macros.hh>
61 
62 # include <scribo/text/clean_inplace.hh>
63 
64 # include <scribo/core/line_set.hh>
65 
66 
67 # include <tesseract/baseapi.h>
68 
69 # if defined HAVE_TESSERACT_3
70 # include <tesseract/resultiterator.h>
71 # endif // ! HAVE_TESSERACT_3
72 
73 
74 
75 
76 namespace scribo
77 {
78 
79  namespace text
80  {
81 
/// Run Tesseract on every text line of \p lines.
///
/// Lines for which is_textline() is false are skipped. For the other
/// ones, the recognized string is stored in the line through
/// update_text().
///
/// \param[in,out] lines     The set of lines to process.
/// \param[in]     language  Language name forwarded to the Tesseract
///                          initialization routine.
90  template <typename L>
91  void
92  recognition(line_set<L>& lines, const char *language);
93 
94 
/// Run Tesseract on a single line image.
///
/// When \p output_file is not empty, the coordinates of the bounding
/// box of the image domain (pmin row, pmin col, pmax row, pmax col)
/// followed by the recognized text are written to that file,
/// separated by spaces. When it is empty, nothing is written.
///
/// \param[in] line         The image of the text line.
/// \param[in] language     Language name forwarded to the Tesseract
///                         initialization routine.
/// \param[in] output_file  Path of the file receiving the result
///                         (defaults to an empty string).
105  template <typename I>
106  void
107  recognition(const Image<I>& line,
108  const char *language,
109  const std::string& output_file = std::string());
110 
111 
112 
113 # ifndef MLN_INCLUDE_ONLY
114 
115 # ifdef HAVE_TESSERACT_2
116 
117 
118  template <typename L>
119  void
120  recognition(line_set<L>& lines, const char *language)
121  {
122  using namespace mln;
123 
124  mln_trace("scribo::text::recognition");
125 
126  /* Tesseract is known to have issues while reading training
127  data, depending on the locale in use. Training data files
128  contain floating-point values and the decimal separator may
129  either be '.' or ',' which may trigger a fatal error with the
130  following message:
131 
132  Error: Illegal min or max specification!
133 
134  The recommended solution is to set the locale of LC_NUMERIC
135  to "C". For more information, see:
136 
137  http://code.google.com/p/tesseract-ocr/wiki/FAQ#Error:_Illegal_min_or_max_specification
138  */
139  setlocale(LC_NUMERIC, "C");
140 
141  // Initialize Tesseract.
142  TessBaseAPI::InitWithLanguage(NULL, NULL, language, NULL, false, 0, NULL);
143 
144  typedef mln_ch_value(L,bool) I;
145 
147  for_all_lines(i, lines)
148  {
149  if (! lines(i).is_textline())
150  continue;
151 
152  mln_domain(I) box = lines(i).bbox();
153 
154  // Make sure characters are isolated from the borders.
155  // Help Tesseract.
156  box.enlarge(2);
157 
158  I text_ima(box);
159  data::fill(text_ima, false);
160 
161  // Careful: background is set to 'false'.
162  const component_set<L>& comp_set = lines.components();
163  const L& lbl = comp_set.labeled_image();
164 
165  // Extract each character component to create the line image.
166  const mln::util::array<component_id_t>& comps =
167  lines(i).component_ids();
168  for_all_elements(e, lines(i).component_ids())
169  {
170  unsigned comp_id = comps(e);
171  data::fill(((text_ima | comp_set(comp_id).bbox()).rw()
172  | (pw::value(lbl) == pw::cst(comp_id))).rw(),
173  true);
174  }
175 
177  text::clean_inplace(lines(i), text_ima);
178 
179  // Make sure characters are isolated from the borders.
180  // Help Tesseract.
181  //
182  // FIXME: can be improved! We need a morpher for a constant
183  // extension set to false (avoid data::fill), a morpher for
184  // translating the domain to (0,0) (avoid the creation of a
185  // new image), change the default border::thickness to 0 and a
186  // morpher to enlarge the domain to a part of the extension.
187  mln_domain(I) lbox = text_ima.domain();
188  lbox.enlarge(lines(i).char_space() + 2);
189  I line_image(lbox, 0); // Make sure there is no border!
190  data::fill(line_image, false);
191  data::paste_without_localization(text_ima, line_image);
192 
193  // Recognize characters.
194  char* s = TessBaseAPI::TesseractRect(
195  (unsigned char*) line_image.buffer(),
196  sizeof (bool), // pixel size
197  line_image.ncols() * sizeof (bool), // row offset
198  0, // left
199  0, // top
200  line_image.ncols(), // n cols
201  line_image.nrows()); // n rows
202 
203  if (s != 0)
204  {
205  std::string str(s);
206  str = str.substr(0, str.length() - 2);
207  lines(i).update_text(str);
208  }
209 
210  // The string has been allocated by Tesseract. It must be released.
211  delete[] s;
212  }
213 
214  // Restore the default locale from the environment.
215  setlocale(LC_NUMERIC, "");
216  }
217 
218 
219  template <typename I>
220  void
221  recognition(const Image<I>& line_,
222  const char *language,
223  const std::string& output_file)
224  {
225  using namespace mln;
226 
227  mln_trace("scribo::text::recognition");
228 
229  const I& line = exact(line_);
230  mln_precondition(line.is_valid());
231 
232  // See the above explanations about setlocale.
233  setlocale(LC_NUMERIC, "C");
234 
235  // Initialize Tesseract.
236  TessBaseAPI::InitWithLanguage(NULL, NULL, language, NULL, false, 0, NULL);
237 
238  std::ofstream file;
239  if (!output_file.empty())
240  file.open(output_file.c_str());
241 
242  mln_domain(I) box = line.domain();
243  // Make sure characters are isolated from the borders.
244  // Help Tesseract.
245  box.enlarge(2);
246 
247  I text_ima(box);
248  data::fill(text_ima, false);
249  data::paste(line, text_ima);
250 
251  // Make sure there is no border.
252  border::resize(text_ima, 0);
253 
254  // Recognize characters.
255  char* s = TessBaseAPI::TesseractRect(
256  (unsigned char*) text_ima.buffer(),
257  sizeof (bool), // pixel size
258  text_ima.ncols() * sizeof (bool), // row offset
259  0, // left
260  0, // top
261  text_ima.ncols(), // n cols
262  text_ima.nrows()); // n rows
263 
264  if (s != 0)
265  {
266  if (!output_file.empty())
267  {
268  std::string str(s);
269  str = str.substr(0, str.length() - 1);
270  file << line.domain().bbox().pmin().row()
271  << " "
272  << line.domain().bbox().pmin().col()
273  << " "
274  << line.domain().bbox().pmax().row()
275  << " "
276  << line.domain().bbox().pmax().col()
277  << " "
278  << str;
279  }
280  }
281 
282  // The string has been allocated by Tesseract. We must free it.
283  delete[] s;
284 
285  if (!output_file.empty())
286  file.close();
287 
288  // See the above explanations about setlocale.
289  setlocale(LC_NUMERIC, "");
290  }
291 
292 
293 # else // HAVE_TESSERACT_3
294 
295 
296  template <typename L>
297  void
298  recognition(line_set<L>& lines, const char *language)
299  {
300  using namespace mln;
301 
302  mln_trace("scribo::text::recognition");
303 
304  // See the above explanations about setlocale.
305  setlocale(LC_NUMERIC, "C");
306 
307  // Initialize Tesseract.
308  tesseract::TessBaseAPI tess;
309  if (tess.Init(NULL, language, tesseract::OEM_DEFAULT) == -1)
310  {
311  std::cerr << "Error: cannot initialize tesseract!" << std::endl;
312  abort();
313  }
314  tess.SetPageSegMode(tesseract::PSM_SINGLE_LINE);
315 
316  typedef mln_ch_value(L,bool) I;
317 
318  // Use text bboxes with Tesseract.
319  for_all_lines(i, lines)
320  {
321  if (! lines(i).is_textline())
322  continue;
323 
324  mln_domain(I) box = lines(i).bbox();
325 
326  // Make sure characters are isolated from the borders.
327  // Help Tesseract.
328  // FIXME: not needed anymore in tesseract 3 ?
329  //
330  box.enlarge(2);
331 
332  I text_ima(box);
333  data::fill(text_ima, false);
334 
335  // Careful: background is set to 'false'.
336  const component_set<L>& comp_set = lines.components();
337  const L& lbl = comp_set.labeled_image();
338 
339  // Extract each character component to create the line image.
340  const mln::util::array<component_id_t>& comps =
341  lines(i).component_ids();
342  for_all_elements(e, lines(i).component_ids())
343  {
344  unsigned comp_id = comps(e);
345  data::fill(((text_ima | comp_set(comp_id).bbox()).rw()
346  | (pw::value(lbl) == pw::cst(comp_id))).rw(),
347  true);
348  }
349 
351  text::clean_inplace(lines(i), text_ima);
352 
353  // Recognize characters.
354  tess.SetImage(
355  (unsigned char*) &text_ima(text_ima.domain().pmin()),
356  text_ima.ncols(), // n cols
357  text_ima.nrows(), // n rows
358  sizeof (bool), // pixel size
359  text_ima.ncols() * sizeof (bool)
360  + 2 * text_ima.border()); // row offset
361 
362  char *s = tess.GetUTF8Text();
363  if (s != 0)
364  {
365  tesseract::ResultIterator *it = tess.GetIterator();
366  std::string str(s);
367  str = str.substr(0, str.length() - 2);
368  lines(i).update_text(str, it->Confidence(tesseract::RIL_TEXTLINE));
369  }
370 
371  delete[] s;
372  }
373 
374  // Restore the default locale from the environment.
375  setlocale(LC_NUMERIC, "");
376  }
377 
378 
379  template <typename I>
380  void
381  recognition(const Image<I>& line_,
382  const char *language,
383  const std::string& output_file)
384  {
385  using namespace mln;
386 
387  mln_trace("scribo::text::recognition");
388 
389  const I& line = exact(line_);
390  mln_precondition(line.is_valid());
391 
392  // Restore the default locale from the environment.
393  setlocale(LC_NUMERIC, "C");
394 
395  // Initialize Tesseract.
396  tesseract::TessBaseAPI tess;
397  if (tess.Init(NULL, language, tesseract::OEM_DEFAULT) == -1)
398  {
399  std::cerr << "Error: cannot initialize tesseract!" << std::endl;
400  abort();
401  }
402 
403  std::ofstream file;
404  if (!output_file.empty())
405  file.open(output_file.c_str());
406 
407  // Recognize characters.
408  char* s = tess.TesseractRect(
409  (unsigned char*) &line(line.domain().pmin()),
410  sizeof (bool), // pixel size
411  line.ncols() * sizeof (bool) + line.border() * 2, // row offset
412  0, // left
413  0, // top
414  line.ncols(), // n cols
415  line.nrows()); // n rows
416 
417  if (s != 0)
418  {
419  if (!output_file.empty())
420  {
421  std::string str(s);
422  str = str.substr(0, str.length() - 1);
423  file << line.domain().bbox().pmin().row()
424  << " "
425  << line.domain().bbox().pmin().col()
426  << " "
427  << line.domain().bbox().pmax().row()
428  << " "
429  << line.domain().bbox().pmax().col()
430  << " "
431  << str;
432  }
433  }
434 
435  // The string has been allocated by Tesseract. We must free it.
436  delete[] s;
437 
438  if (!output_file.empty())
439  file.close();
440 
441  // Restore the default locale from the environment.
442  setlocale(LC_NUMERIC, "");
443  }
444 
445 
446 # endif // ! HAVE_TESSERACT_2
447 
448 # endif // ! MLN_INCLUDE_ONLY
449 
450  } // end of namespace scribo::text
451 
452 } // end of namespace scribo
453 
454 #endif // ! SCRIBO_TEXT_RECOGNITION_HH