$extrastylesheet
Olena  User documentation 2.1
An Image Processing Platform
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
content_in_hdoc_functor.hh
1 // Copyright (C) 2010, 2011 EPITA Research and Development Laboratory
2 // (LRDE)
3 //
4 // This file is part of Olena.
5 //
6 // Olena is free software: you can redistribute it and/or modify it under
7 // the terms of the GNU General Public License as published by the Free
8 // Software Foundation, version 2 of the License.
9 //
10 // Olena is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 // General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License
16 // along with Olena. If not, see <http://www.gnu.org/licenses/>.
17 //
18 // As a special exception, you may use this file as part of a free
19 // software project without restriction. Specifically, if other files
20 // instantiate templates or use macros or inline functions from this
21 // file, or you compile this file and link it with other files to produce
22 // an executable, this file does not by itself cause the resulting
23 // executable to be covered by the GNU General Public License. This
24 // exception does not however invalidate any other reasons why the
25 // executable file might be covered by the GNU General Public License.
26 
27 #ifndef SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_HDOC_FUNCTOR_HH
28 # define SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_HDOC_FUNCTOR_HH
29 
34 
35 
36 # ifndef SCRIBO_NDEBUG
37 # include <mln/util/timer.hh>
38 # endif // ! SCRIBO_NDEBUG
39 
40 # include <scribo/core/def/lbl_type.hh>
41 # include <scribo/core/document.hh>
42 # include <scribo/core/line_set.hh>
43 # include <scribo/core/paragraph_set.hh>
44 
45 # include <scribo/primitive/extract/non_text_hdoc.hh>
46 # include <scribo/primitive/extract/components.hh>
47 # include <scribo/primitive/extract/lines_h_thick_and_thin.hh>
48 
49 # include <scribo/primitive/extract/alignments.hh>
50 
51 # include <scribo/primitive/identify.hh>
52 
53 # include <scribo/primitive/remove/separators.hh>
54 
55 # include <scribo/preprocessing/rotate_90.hh>
56 
57 # include <scribo/filter/line_links_x_height.hh>
58 # include <scribo/filter/object_links_bbox_h_ratio.hh>
59 # include <scribo/filter/objects_small.hh>
60 # include <scribo/filter/paragraphs_bbox_overlap.hh>
61 # include <scribo/filter/paragraphs_in_image.hh>
62 # include <scribo/filter/paragraphs_in_borders.hh>
63 # include <scribo/filter/separators_in_element.hh>
64 # include <scribo/filter/separators_in_paragraph.hh>
65 # include <scribo/filter/separators_in_borders.hh>
66 # include <scribo/filter/images_in_paragraph.hh>
67 
68 # include <scribo/primitive/group/from_single_link.hh>
69 
70 # include <scribo/primitive/link/merge_double_link.hh>
71 # include <scribo/primitive/link/internal/dmax_width_and_height.hh>
72 # include <scribo/primitive/link/with_single_left_link_dmax_ratio.hh>
73 # include <scribo/primitive/link/with_single_right_link_dmax_ratio.hh>
74 
75 # include <scribo/preprocessing/denoise_fg.hh>
76 
77 # include <scribo/postprocessing/images_to_drop_capital.hh>
78 
79 # ifndef SCRIBO_NOCR
80 # include <scribo/text/recognition.hh>
81 # endif // ! SCRIBO_NOCR
82 
83 # include <scribo/text/merging_hdoc.hh>
84 # include <scribo/text/link_lines.hh>
85 # include <scribo/text/extract_paragraphs_hdoc.hh>
86 
87 # include <scribo/make/debug_filename.hh>
88 
89 # include <scribo/debug/decision_image.hh>
90 # include <scribo/debug/bboxes_image.hh>
91 # include <scribo/debug/linked_bboxes_image.hh>
92 # include <scribo/debug/bboxes_enlarged_image.hh>
93 # include <scribo/debug/mean_and_base_lines_image.hh>
94 # include <scribo/debug/looks_like_a_text_line_image.hh>
95 
96 # include <scribo/toolchain/internal/toolchain_functor.hh>
97 
98 # include <scribo/io/xml/save.hh>
99 
100 #include <scribo/io/img/save.hh>
101 
102 namespace scribo
103 {
104 
105  namespace toolchain
106  {
107 
108  namespace internal
109  {
110 
111 
// Toolchain functor: holds the options, the parameters and the resulting
// document that operator() reads and fills in.
// NOTE(review): the declarator line naming the struct (listing line 116)
// is absent from this listing; only the base-class clause is visible.
115  template <typename I>
117  : public Toolchain_Functor
118  {
     // V is the label value type; L is presumably the image type I with
     // its value type changed to V (mln_ch_value) -- TODO confirm.
119  typedef scribo::def::lbl_type V;
120  typedef mln_ch_value(I,V) L;
121 
     // doc_filename is used to initialize the `doc` member.
122  content_in_hdoc_functor(const char *doc_filename);
123 
     // Step count computed from the enabled options.
124  virtual int nsteps() const;
125 
     // Hook called by operator() right after the document is saved as XML.
126  virtual void on_xml_saved();
127 
128  //===============
129  // Core function
130  //===============
131 
     // Runs the toolchain: stores original_image and processed_image in
     // `doc`, processes processed_image and returns `doc`.
     // Both images must be valid (checked with mln_precondition).
132  template <typename J>
133  scribo::document<L> operator()(const Image<J>& original_image,
134  const Image<I>& processed_image);
135 
136 
137  //=========
138  // Options
139  //=========
     // All options default to true in the constructor, except
     // save_doc_as_xml which defaults to false.
     // When true, the input is denoised with preprocessing::denoise_fg.
140  bool enable_denoising;
     // When true, vertical/horizontal line separators are extracted and
     // removed from the input before component extraction.
141  bool enable_line_seps;
     // When true, whitespace separators are extracted with
     // primitive::extract::alignments and added to the components.
142  bool enable_whitespace_seps;
     // When true (and SCRIBO_NOCR is not defined), text lines are passed
     // to scribo::text::recognition.
143  bool enable_ocr;
     // When true, the document is saved with scribo::io::xml::save.
144  bool save_doc_as_xml;
     // Format passed to scribo::io::xml::save.
145  scribo::io::xml::Format xml_format;
146 
147  //============
148  // Parameters
149  //============
150 
     // Language passed to scribo::text::recognition (default: "eng").
151  std::string ocr_language;
     // Destination passed to scribo::io::xml::save
     // (default: "/tmp/foo.xml").
152  std::string output_file;
153 
154  //=========
155  // Results
156  //=========
     // Document filled in and returned by operator().
157  document<L> doc;
158 
159 
160 
161 # ifndef SCRIBO_NDEBUG
162  //=============
163  // DEBUG TOOLS
164  //=============
     // Timing hooks, only declared when SCRIBO_NDEBUG is not defined.
165  virtual void on_start();
166  virtual void on_end();
167  virtual void on_progress();
168 
     // Global timer: started in on_start(), stopped and printed in on_end().
170  mln::util::timer gt;
171 # endif // ! SCRIBO_NDEBUG
172  };
173 
174 
175 # ifndef MLN_INCLUDE_ONLY
176 
// Constructor: turns on denoising, line separators, whitespace separators
// and OCR, leaves XML saving off, and initializes `doc` from doc_filename.
// NOTE(review): the declarator line (listing line 178) is absent from this
// listing; only the member initializer list and the empty body are visible.
177  template <typename I>
179  : enable_denoising(true),
180  enable_line_seps(true),
181  enable_whitespace_seps(true),
182  enable_ocr(true),
183  save_doc_as_xml(false),
184  xml_format(scribo::io::xml::PageExtended),
185  ocr_language("eng"),
     // NOTE(review): hard-coded default path; it is only used when
     // save_doc_as_xml is set, so callers enabling XML output presumably
     // need to override output_file -- confirm.
186  output_file("/tmp/foo.xml"),
187  doc(doc_filename)
188  {
189  }
190 
191 
192  //===============
193  // Core function
194  //===============
195 
196  template <typename I>
197  template <typename J>
199  content_in_hdoc_functor<I>::operator()(const Image<J>& original_image,
200  const Image<I>& processed_image)
201  {
202  mln_precondition(exact(original_image).is_valid());
203  mln_precondition(exact(processed_image).is_valid());
204 
205  on_start();
206 
207  doc.set_image(exact(original_image));
208  doc.set_binary_image(exact(processed_image));
209 
210  // Remove separators
211  mln_ch_value(I,bool)
212  separators,
213  input_cleaned = exact(processed_image);
214  if (enable_line_seps)
215  {
216  // FIXME: SLOW
217  on_new_progress_label("Find vertical and horizontal separators...");
218 
219  // Vertical and horizontal separators
220  {
221  unsigned closing_size = std::min(0.01 * doc.image().domain().width(),
222  0.01 * doc.image().domain().height());
223  win::hline2d hl(closing_size);
224 
225  // Apply a closing::structural in order to connect disconnected
226  // parts of a single separator.
227  mln_ch_value(I,bool)
228  vseparators = preprocessing::rotate_90(
229  morpho::closing::structural(
230  primitive::extract::lines_h_thick_and_thin(
231  preprocessing::rotate_90(processed_image),
232  101, 3, 0.2, 0.6, 10), hl), false),
233 
234  hseparators = morpho::closing::structural(
235  primitive::extract::lines_h_thick_and_thin(
236  processed_image, 101, 3), hl);
237 
238  doc.set_vline_separators(vseparators);
239  doc.set_hline_separators(hseparators);
240 
241  separators = duplicate(vseparators);
242  separators += hseparators;
243 
244  border::resize(processed_image, border::thickness);
245  }
246 
247  on_progress();
248 
249  on_new_progress_label("Remove separators...");
250 
251  input_cleaned = primitive::remove::separators(processed_image,
252  separators);
253 
254  on_progress();
255  }
256 
257 
258 # ifndef SCRIBO_NDEBUG
259  // Debug
260  if (enable_line_seps)
261  {
262  debug::logger().log_image(debug::AuxiliaryResults,
263  doc.vline_seps(),
264  "vseparators");
265 
266  debug::logger().log_image(debug::AuxiliaryResults,
267  doc.hline_seps(),
268  "hseparators");
269 
270  debug::logger().log_image(debug::AuxiliaryResults,
271  input_cleaned,
272  "input_wo_separators");
273  }
274 # endif // ! SCRIBO_NDEBUG
275 
276  unsigned min_area = std::min(0.005 * doc.image().domain().width(),
277  0.005 * doc.image().domain().height());
278 
279  // Denoise
280  if (enable_denoising)
281  {
282  on_new_progress_label("Denoise...");
283 
284  if (verbose)
285  std::cout << ">> min_area = " << min_area << std::endl;
286 
287  input_cleaned = preprocessing::denoise_fg(input_cleaned, c8(), min_area);
288 
289  // Debug
290 # ifndef SCRIBO_NDEBUG
291  debug::logger().log_image(debug::AuxiliaryResults,
292  input_cleaned, "denoised");
293 # endif // ! SCRIBO_NDEBUG
294 
295  on_progress();
296  }
297 
298  doc.set_binary_image_wo_seps(input_cleaned);
299 
301  on_new_progress_label("Finding components...");
302 
303  // NOTE: Component features computation is disabled.
304  V ncomponents;
305  component_set<L>
307  c8(),
308  ncomponents);
309 
310  on_progress();
311 
312 
314  if (enable_line_seps)
315  components.add_separators(separators);
316 
317  // Debug
318 # ifndef SCRIBO_NDEBUG
319  debug::logger().log_image(debug::AuxiliaryResults,
320  components.separators(),
321  "all_separators");
322 # endif // ! SCRIBO_NDEBUG
323 
324 
325  on_new_progress_label("Filtering components");
326 
327  components = scribo::filter::components_small(components, min_area);
328 
329  on_progress();
330 
331 
333  on_new_progress_label("Linking objects...");
334 
335  object_links<L> left_link
337  components,
338 // primitive::link::internal::dmax_width_and_height(1),
339  primitive::link::internal::dmax_default(1),
340  anchor::MassCenter);
341 
342 
343  object_links<L> right_link
345  components,
346 // primitive::link::internal::dmax_width_and_height(1),
347  primitive::link::internal::dmax_default(1),
348  anchor::MassCenter);
349 
350  // Debug
351 # ifndef SCRIBO_NDEBUG
352  if (debug::logger().is_enabled())
353  {
355  debug::AuxiliaryResults,
356  debug::linked_bboxes_image(processed_image,
357  left_link,
358  right_link,
359  literal::blue,
360  literal::cyan,
361  literal::yellow,
362  literal::green,
363  anchor::MassCenter),
364  "object_links");
365  }
366 # endif // ! SCRIBO_NDEBUG
367 
368  // Validating left and right links.
369  object_links<L>
370  merged_links = primitive::link::merge_double_link(left_link,
371  right_link);
372 
373  on_progress();
374 
375 
376  on_new_progress_label("Filtering objects");
377 
378  // Remove links if bboxes have too different sizes.
379  object_links<L> hratio_filtered_links
380  = filter::object_links_bbox_h_ratio(merged_links, 2.5f);
381 
382 
383 # ifndef SCRIBO_NDEBUG
384  if (debug::logger().is_enabled())
385  {
386  mln_ch_value(I,value::rgb8)
387  hratio_decision_image = scribo::debug::decision_image(processed_image,
388  merged_links,
389  hratio_filtered_links,
390  anchor::MassCenter);
391  // Debug
392  debug::logger().log_image(debug::AuxiliaryResults,
393  hratio_decision_image,
394  "hratio_links_decision_image");
395  }
396 # endif // ! SCRIBO_NDEBUG
397 
398  on_progress();
399 
400 
401  on_new_progress_label("Rebuilding lines");
402 
403  object_groups<L>
404  groups = primitive::group::from_single_link(hratio_filtered_links);
405 
406 
407 
408  // Construct a line set.
409  line_set<L>
410  lines = scribo::make::line_set(groups);
411 
412  on_progress();
413 
414 
415  // Extract whitespace to improve text merging results afterwards.
416  mln_ch_value(L,bool) whitespaces;
417  if (enable_whitespace_seps)
418  {
420  doc.set_paragraphs(parset);
421 
422  // Whitespace separators
423  on_new_progress_label("Find whitespace separators...");
424 
426  res = primitive::extract::alignments(doc, 3, 3);
427  whitespaces = res.second();
428 
429  on_progress();
430 
431  components.add_separators(res.second());
432  doc.set_whitespace_separators(res.second(), res.first());
433  }
434 
435 
436  //===== DEBUG =====
437 # ifndef SCRIBO_NDEBUG
438  if (debug::logger().is_enabled())
439  {
440  if (enable_whitespace_seps)
441  debug::logger().log_image(debug::AuxiliaryResults,
442  whitespaces, "whitespaces");
443 
444  // Bboxes image.
446  debug::AuxiliaryResults,
447  scribo::debug::bboxes_image(processed_image, lines),
448  "step1_bboxes");
449 
450  // Bboxes enlarged
452  debug::AuxiliaryResults,
453  scribo::debug::bboxes_enlarged_image(processed_image, lines),
454  "step1_bboxes_enlarged");
455 
456  // Looks like a text line
458  debug::AuxiliaryResults,
459  scribo::debug::looks_like_a_text_line_image(processed_image, lines),
460  "step1_looks_like_a_text_line");
461 
462  // mean and base lines.
464  debug::AuxiliaryResults,
465  scribo::debug::mean_and_base_lines_image(processed_image, lines),
466  "step1_x_height");
467  }
468 # endif // ! SCRIBO_NDEBUG
469  //===== END OF DEBUG =====
470 
471  on_new_progress_label("Merging segmented lines");
472 
473  lines = scribo::text::merging_hdoc(lines);
474 
475 
476  //===== DEBUG =====
477 # ifndef SCRIBO_NDEBUG
478  if (debug::logger().is_enabled())
479  {
480 
481  // mean and base lines.
483  debug::AuxiliaryResults,
484  scribo::debug::mean_and_base_lines_image(processed_image, lines),
485  "step2_x_height");
486 
487  // Looks like a text line
489  debug::AuxiliaryResults,
490  scribo::debug::looks_like_a_text_line_image(processed_image, lines),
491  "step2_looks_like_a_text_line");
492 
493  // Bboxes image.
495  debug::AuxiliaryResults,
496  scribo::debug::bboxes_image(processed_image, lines),
497  "step2_bboxes");
498  }
499 # endif // ! SCRIBO_NDEBUG
500  //===== END OF DEBUG =====
501 
502  on_progress();
503 
504 
505  //===== DEBUG =====
506 # ifndef SCRIBO_NDEBUG
507  {
508  image2d<bool> tmp = duplicate(input_cleaned);
509  for_all_lines(l, lines)
510  if (lines(l).is_textline())
511  mln::draw::box_plain(tmp, lines(l).bbox(), false);
512 
513  debug::logger().log_image(
514  debug::AuxiliaryResults,
515  tmp,
516  "input_wo_text");
517  }
518 # endif // ! SCRIBO_NDEBUG
519  //===== END OF DEBUG =====
520 
521 
522 # ifndef SCRIBO_NOCR
523  // Text recognition
524  if (enable_ocr)
525  {
526  on_new_progress_label("Recognizing text");
527 
528  scribo::text::recognition(lines, ocr_language.c_str());
529 
530  on_progress();
531  }
532 # endif // ! SCRIBO_NOCR
533 
534  on_new_progress_label("Extracting paragraphs");
535 
538  doc.binary_image());
539 
540  on_progress();
541 
542  on_new_progress_label("Filtering paragraphs");
543 
544  // paragraph_set<L> parset_f = filter::paragraphs_bbox_overlap(parset);
545  // doc.set_paragraphs(parset_f);
546 
547  parset = filter::paragraphs_bbox_overlap(parset);
548  doc.set_paragraphs(parset);
549 
550  on_progress();
551 
552 
553  // Extract other Elements
554  on_new_progress_label("Extracting Elements");
555 
556  unsigned closing_size = std::min(0.01 * doc.image().domain().width(),
557  0.01 * doc.image().domain().height());
558  if (!(closing_size % 2))
559  closing_size += 1;
560 
561  if (verbose)
562  std::cout << ">> CLosing size = " << closing_size << std::endl;
563 
564  component_set<L>
565  elements = scribo::primitive::extract::non_text_hdoc(doc, closing_size);
566 
567  on_progress();
568 
569 
570  // Identify other Elements
571  on_new_progress_label("Identifying Elements");
572  elements = scribo::primitive::identify(elements);
573 
574  doc.set_elements(elements);
575 
576  on_progress();
577 
578 // // TEMPORARY DEBUG
579 // on_new_progress_label("Saving debug data");
580 // doc.set_paragraphs(parset);
581 // scribo::io::img::save(doc, "debug_wo_filter.png", scribo::io::img::DebugWoImage);
582 // scribo::io::img::save(doc, "full_wo_filter.png", scribo::io::img::DebugWithImage);
583 // doc.set_paragraphs(parset_f);
584 // on_progress();
585 // // END OF TEMPORARY DEBUG
586 
587  on_new_progress_label("Cleanup miscellaneous false positive");
588 
590  filter::separators_in_paragraph(doc, 81, 121);
591  filter::separators_in_borders(doc, 0.05, 0.02);
592 
595 
596  on_progress();
597 
598  on_new_progress_label("Rebuild extracted images");
599 
600  elements = scribo::primitive::extract::non_text_hdoc(doc, closing_size);
601  doc.set_elements(elements);
602 
603  on_progress();
604 
605  on_new_progress_label("Tag images as drop capital");
606 
608 
609  on_progress();
610 
611  // Saving results
612  if (save_doc_as_xml)
613  {
614  on_new_progress_label("Saving results");
615 
616  scribo::io::xml::save(doc, output_file, xml_format);
617  on_xml_saved();
618 
619  on_progress();
620  }
621 
622  on_end();
623 
624 
625  return doc;
626  }
627 
628 
629 
// Returns the step count: a fixed base of 10 plus one for each enabled
// option (the bool options are added as integers).
// NOTE(review): the operator() body visible in this listing calls
// on_progress() 13 times unconditionally and twice when enable_line_seps
// is set, so this total may be under-counted -- confirm against the
// complete source and against how Toolchain_Functor uses nsteps().
630  template<typename I>
631  int
632  content_in_hdoc_functor<I>::nsteps() const
633  {
634  return 10 + enable_denoising + enable_line_seps
635  + enable_whitespace_seps + enable_ocr + save_doc_as_xml;
636  }
637 
638 
// Hook called by operator() right after scribo::io::xml::save() when
// save_doc_as_xml is set. It is declared virtual, so a derived functor can
// override it; this implementation intentionally does nothing.
639  template<typename I>
640  void
641  content_in_hdoc_functor<I>::on_xml_saved()
642  {
643  // Nothing
644  }
645 
646 # ifndef SCRIBO_NDEBUG
647 
// Debug-only hook (compiled when SCRIBO_NDEBUG is not defined), called at
// the beginning of operator(). Starts the global timer `gt` and the timer
// `t`; `t` is not declared in this class, presumably it is inherited from
// Toolchain_Functor -- TODO confirm.
648  template <typename I>
649  void
650  content_in_hdoc_functor<I>::on_start()
651  {
652  gt.start();
653  t.start();
654  }
655 
// Debug-only hook (compiled when SCRIBO_NDEBUG is not defined), called at
// the end of operator(). Stops the global timer `gt` and, when `verbose`
// is set, prints the total time to std::cout. `verbose` is not declared in
// this class, presumably it is inherited from Toolchain_Functor -- TODO
// confirm.
656  template <typename I>
657  void
658  content_in_hdoc_functor<I>::on_end()
659  {
660  gt.stop();
661  if (verbose)
662  std::cout << "Total time: " << gt << std::endl;
663  }
664 
// Debug-only hook (compiled when SCRIBO_NDEBUG is not defined), called by
// operator() after each processing step. Stops the timer `t`, prints its
// value to std::cout when `verbose` is set, then restarts it so that the
// next call reports the following step. `t` and `verbose` are not declared
// in this class, presumably they are inherited from Toolchain_Functor --
// TODO confirm.
665  template <typename I>
666  void
667  content_in_hdoc_functor<I>::on_progress()
668  {
669  t.stop();
670  if (verbose)
671  std::cout << t << std::endl;
672  t.restart();
673  }
674 
675 
676 # endif // ! SCRIBO_NDEBUG
677 
678 
679 # endif // ! MLN_INCLUDE_ONLY
680 
681 
682  } // end of namespace scribo::toolchain::internal
683 
684  } // end of namespace scribo::toolchain
685 
686 } // end of namespace scribo
687 
688 #endif // ! SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_HDOC_FUNCTOR_HH