27 #ifndef SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_HDOC_FUNCTOR_HH
28 # define SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_HDOC_FUNCTOR_HH
36 # ifndef SCRIBO_NDEBUG
37 # include <mln/util/timer.hh>
38 # endif // ! SCRIBO_NDEBUG
40 # include <scribo/core/def/lbl_type.hh>
41 # include <scribo/core/document.hh>
42 # include <scribo/core/line_set.hh>
43 # include <scribo/core/paragraph_set.hh>
45 # include <scribo/primitive/extract/non_text_hdoc.hh>
46 # include <scribo/primitive/extract/components.hh>
47 # include <scribo/primitive/extract/lines_h_thick_and_thin.hh>
49 # include <scribo/primitive/extract/alignments.hh>
51 # include <scribo/primitive/identify.hh>
53 # include <scribo/primitive/remove/separators.hh>
55 # include <scribo/preprocessing/rotate_90.hh>
57 # include <scribo/filter/line_links_x_height.hh>
58 # include <scribo/filter/object_links_bbox_h_ratio.hh>
59 # include <scribo/filter/objects_small.hh>
60 # include <scribo/filter/paragraphs_bbox_overlap.hh>
61 # include <scribo/filter/paragraphs_in_image.hh>
62 # include <scribo/filter/paragraphs_in_borders.hh>
63 # include <scribo/filter/separators_in_element.hh>
64 # include <scribo/filter/separators_in_paragraph.hh>
65 # include <scribo/filter/separators_in_borders.hh>
66 # include <scribo/filter/images_in_paragraph.hh>
68 # include <scribo/primitive/group/from_single_link.hh>
70 # include <scribo/primitive/link/merge_double_link.hh>
71 # include <scribo/primitive/link/internal/dmax_width_and_height.hh>
72 # include <scribo/primitive/link/with_single_left_link_dmax_ratio.hh>
73 # include <scribo/primitive/link/with_single_right_link_dmax_ratio.hh>
75 # include <scribo/preprocessing/denoise_fg.hh>
77 # include <scribo/postprocessing/images_to_drop_capital.hh>
80 # include <scribo/text/recognition.hh>
81 # endif // ! SCRIBO_NOCR
83 # include <scribo/text/merging_hdoc.hh>
84 # include <scribo/text/link_lines.hh>
85 # include <scribo/text/extract_paragraphs_hdoc.hh>
87 # include <scribo/make/debug_filename.hh>
89 # include <scribo/debug/decision_image.hh>
90 # include <scribo/debug/bboxes_image.hh>
91 # include <scribo/debug/linked_bboxes_image.hh>
92 # include <scribo/debug/bboxes_enlarged_image.hh>
93 # include <scribo/debug/mean_and_base_lines_image.hh>
94 # include <scribo/debug/looks_like_a_text_line_image.hh>
96 # include <scribo/toolchain/internal/toolchain_functor.hh>
98 # include <scribo/io/xml/save.hh>
100 #include <scribo/io/img/save.hh>
// Interface fragment of the content_in_hdoc_functor<I> class template:
// progress/XML hooks, the templated call operator, and the public
// configuration flags read by operator() and nsteps().
115 template <
typename I>
// Number of progress steps; the definition sums a constant with the
// option flags declared below.
124 virtual int nsteps()
const;
// Hook related to XML saving.
// NOTE(review): presumably invoked once the XML output has been written --
// confirm against the base toolchain functor.
126 virtual void on_xml_saved();
// Template header of the call operator, which is parameterized on a second
// image type J.
132 template <
typename J>
// Gates the "Denoise..." stage of operator().
140 bool enable_denoising;
// Gates the vertical/horizontal line separator detection in operator().
141 bool enable_line_seps;
// Gates the whitespace separator detection in operator().
142 bool enable_whitespace_seps;
// Counted as one extra step in nsteps(); initialized to false by the
// constructor.
144 bool save_doc_as_xml;
// XML flavour; initialized to scribo::io::xml::PageExtended by the
// constructor.
145 scribo::io::xml::Format xml_format;
// NOTE(review): presumably the language handed to the OCR stage -- confirm.
151 std::string ocr_language;
// Initialized to "/tmp/foo.xml" by the constructor.
152 std::string output_file;
// The three hooks below are only declared when SCRIBO_NDEBUG is not defined.
161 # ifndef SCRIBO_NDEBUG
165 virtual void on_start();
166 virtual void on_end();
167 virtual void on_progress();
171 # endif // ! SCRIBO_NDEBUG
175 # ifndef MLN_INCLUDE_ONLY
// Constructor member-initializer list: establishes the default configuration.
// The three optional stages (denoising, line separators, whitespace
// separators) start enabled, XML saving starts disabled, the XML format is
// PageExtended and the output file defaults to "/tmp/foo.xml".
// NOTE(review): the default output path is a fixed location under /tmp;
// callers presumably override output_file before enabling save_doc_as_xml --
// confirm.
177 template <
typename I>
179 : enable_denoising(true),
180 enable_line_seps(true),
181 enable_whitespace_seps(true),
183 save_doc_as_xml(false),
184 xml_format(scribo::io::xml::PageExtended),
186 output_file(
"/tmp/foo.xml"),
// Call operator: runs the document-analysis pipeline on a pair of images
// (the original image and a processed image), storing intermediate and final
// results in `doc`. Each stage announces itself through
// on_new_progress_label(); blocks guarded by SCRIBO_NDEBUG only produce debug
// output (logged images / traces).
196 template <
typename I>
197 template <
typename J>
199 content_in_hdoc_functor<I>::operator()(
const Image<J>& original_image,
// Both input images must be valid.
202 mln_precondition(
exact(original_image).is_valid());
203 mln_precondition(
exact(processed_image).is_valid());
// Register the inputs in the document.
207 doc.set_image(
exact(original_image));
208 doc.set_binary_image(
exact(processed_image));
// Working image; starts as the processed input and is replaced by the
// cleaned versions produced by the stages below.
213 input_cleaned = exact(processed_image);
// --- Optional stage: vertical and horizontal line separators.
214 if (enable_line_seps)
217 on_new_progress_label(
"Find vertical and horizontal separators...");
// 1% of the smaller image dimension; the floating-point result is converted
// to unsigned on initialization.
221 unsigned closing_size =
std::min(0.01 * doc.image().domain().width(),
222 0.01 * doc.image().domain().height());
// The vertical separators computation involves rotating the processed image
// by 90 degrees and rotating a result again.
228 vseparators = preprocessing::rotate_90(
231 preprocessing::rotate_90(processed_image),
232 101, 3, 0.2, 0.6, 10), hl), false),
236 processed_image, 101, 3), hl);
// Publish both separator images in the document.
238 doc.set_vline_separators(vseparators);
239 doc.set_hline_separators(hseparators);
// Accumulate the horizontal separators into `separators`.
242 separators += hseparators;
// Remove the detected separators from the processed image.
249 on_new_progress_label("Remove separators...");
251 input_cleaned = primitive::remove::separators(processed_image,
258 # ifndef SCRIBO_NDEBUG
260 if (enable_line_seps)
272 "input_wo_separators");
274 # endif // ! SCRIBO_NDEBUG
// 0.5% of the smaller image dimension, converted to unsigned.
276 unsigned min_area =
std::min(0.005 * doc.image().domain().width(),
277 0.005 * doc.image().domain().height());
// --- Optional stage: denoising.
280 if (enable_denoising)
282 on_new_progress_label(
"Denoise...");
// Trace the threshold on standard output.
285 std::cout <<
">> min_area = " << min_area << std::endl;
290 # ifndef SCRIBO_NDEBUG
292 input_cleaned,
"denoised");
293 # endif // ! SCRIBO_NDEBUG
// Store the cleaned binary image in the document.
298 doc.set_binary_image_wo_seps(input_cleaned);
// --- Component extraction.
301 on_new_progress_label(
"Finding components...");
// Attach the separators to the component set when they were computed.
314 if (enable_line_seps)
315 components.add_separators(separators);
318 # ifndef SCRIBO_NDEBUG
320 components.separators(),
322 # endif // ! SCRIBO_NDEBUG
325 on_new_progress_label(
"Filtering components");
// --- Object linking: a left link set and a right link set, both built with
// dmax_default(1).
333 on_new_progress_label(
"Linking objects...");
335 object_links<L> left_link
339 primitive::link::internal::dmax_default(1),
343 object_links<L> right_link
347 primitive::link::internal::dmax_default(1),
351 # ifndef SCRIBO_NDEBUG
355 debug::AuxiliaryResults,
366 # endif // ! SCRIBO_NDEBUG
// --- Link filtering.
376 on_new_progress_label(
"Filtering objects");
379 object_links<L> hratio_filtered_links
// Debug only: log a decision image for the filtered links.
383 # ifndef SCRIBO_NDEBUG
387 hratio_decision_image = scribo::debug::
decision_image(processed_image,
389 hratio_filtered_links,
392 debug::
logger().log_image(debug::AuxiliaryResults,
393 hratio_decision_image,
394 "hratio_links_decision_image");
396 # endif // ! SCRIBO_NDEBUG
// --- Build the line set from the groups.
401 on_new_progress_label(
"Rebuilding lines");
410 lines = scribo::make::line_set(groups);
// --- Optional stage: whitespace separators.
417 if (enable_whitespace_seps)
420 doc.set_paragraphs(parset);
423 on_new_progress_label(
"Find whitespace separators...");
// `res` is used as a pair: second() is added to the components as separators
// and, together with first(), stored in the document.
427 whitespaces = res.
second();
431 components.add_separators(res.
second());
432 doc.set_whitespace_separators(res.
second(), res.
first());
// Debug only: log the whitespace image and the "step1" line images.
437 # ifndef SCRIBO_NDEBUG
440 if (enable_whitespace_seps)
442 whitespaces,
"whitespaces");
446 debug::AuxiliaryResults,
452 debug::AuxiliaryResults,
454 "step1_bboxes_enlarged");
458 debug::AuxiliaryResults,
460 "step1_looks_like_a_text_line");
464 debug::AuxiliaryResults,
468 # endif // ! SCRIBO_NDEBUG
// --- Line merging.
471 on_new_progress_label(
"Merging segmented lines");
// Debug only: log the "step2" line images.
477 # ifndef SCRIBO_NDEBUG
483 debug::AuxiliaryResults,
489 debug::AuxiliaryResults,
491 "step2_looks_like_a_text_line");
495 debug::AuxiliaryResults,
499 # endif // ! SCRIBO_NDEBUG
// Debug only: iterate over the lines, considering only text lines.
506 # ifndef SCRIBO_NDEBUG
509 for_all_lines(l, lines)
510 if (lines(l).is_textline())
513 debug::
logger().log_image(
514 debug::AuxiliaryResults,
518 # endif // ! SCRIBO_NDEBUG
// --- Text recognition (inside a SCRIBO_NOCR preprocessor section).
526 on_new_progress_label(
"Recognizing text");
532 # endif // ! SCRIBO_NOCR
// --- Paragraph extraction and filtering.
534 on_new_progress_label(
"Extracting paragraphs");
542 on_new_progress_label(
"Filtering paragraphs");
548 doc.set_paragraphs(parset);
// --- Element extraction.
554 on_new_progress_label(
"Extracting Elements");
// Same 1%-of-smaller-dimension rule as in the separator stage.
556 unsigned closing_size =
std::min(0.01 * doc.image().domain().width(),
557 0.01 * doc.image().domain().height());
// Special-cases an even closing_size.
// NOTE(review): presumably adjusted to an odd value -- confirm.
558 if (!(closing_size % 2))
// Trace the chosen size on standard output.
562 std::cout <<
">> CLosing size = " << closing_size << std::endl;
// --- Element identification; the result is stored in the document.
571 on_new_progress_label(
"Identifying Elements");
572 elements = scribo::primitive::identify(elements);
574 doc.set_elements(elements);
// --- Post-processing of the elements.
587 on_new_progress_label(
"Cleanup miscellaneous false positive");
598 on_new_progress_label(
"Rebuild extracted images");
601 doc.set_elements(elements);
605 on_new_progress_label(
"Tag images as drop capital");
// --- Output.
614 on_new_progress_label(
"Saving results");
// Returns the number of progress steps: a constant 10 plus one for each
// option flag that is set (the flags contribute 0 or 1 to the integer sum).
// NOTE(review): operator() shows more on_new_progress_label() calls than the
// maximum this sum can reach (15) -- confirm whether the constant is meant to
// match the number of labels.
632 content_in_hdoc_functor<I>::nsteps()
const
634 return 10 + enable_denoising + enable_line_seps
635 + enable_whitespace_seps + enable_ocr + save_doc_as_xml;
// Out-of-class definition of the virtual on_xml_saved() member declared in
// content_in_hdoc_functor.
641 content_in_hdoc_functor<I>::on_xml_saved()
646 # ifndef SCRIBO_NDEBUG
// Debug-build hook (compiled only when SCRIBO_NDEBUG is not defined):
// out-of-class definition of on_start().
// NOTE(review): presumably starts the timers read by on_progress() and
// on_end() -- confirm.
648 template <
typename I>
650 content_in_hdoc_functor<I>::on_start()
// Debug-build hook (compiled only when SCRIBO_NDEBUG is not defined):
// prints "Total time: " followed by `gt` on standard output.
// NOTE(review): `gt` is presumably a global mln::util::timer -- confirm.
656 template <
typename I>
658 content_in_hdoc_functor<I>::on_end()
662 std::cout <<
"Total time: " << gt << std::endl;
// Debug-build hook (compiled only when SCRIBO_NDEBUG is not defined):
// prints `t` on standard output.
// NOTE(review): `t` is presumably a per-step mln::util::timer -- confirm.
665 template <
typename I>
667 content_in_hdoc_functor<I>::on_progress()
671 std::cout << t << std::endl;
676 # endif // ! SCRIBO_NDEBUG
679 # endif // ! MLN_INCLUDE_ONLY
688 #endif // ! SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_HDOC_FUNCTOR_HH