27 #ifndef SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_DOC_FUNCTOR_HH
28 # define SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_DOC_FUNCTOR_HH
34 # ifndef SCRIBO_NDEBUG
35 # include <mln/util/timer.hh>
36 # endif // ! SCRIBO_NDEBUG
38 # include <scribo/core/def/lbl_type.hh>
39 # include <scribo/core/document.hh>
40 # include <scribo/core/line_set.hh>
41 # include <scribo/core/paragraph_set.hh>
43 # include <scribo/primitive/extract/non_text.hh>
44 # include <scribo/primitive/extract/components.hh>
45 # include <scribo/primitive/extract/separators.hh>
46 # include <scribo/primitive/extract/vertical_separators.hh>
47 # include <scribo/primitive/extract/horizontal_separators.hh>
49 # include <scribo/primitive/extract/alignments.hh>
51 # include <scribo/primitive/identify.hh>
53 # include <scribo/primitive/remove/separators.hh>
55 # include <scribo/filter/line_links_x_height.hh>
56 # include <scribo/filter/object_links_bbox_h_ratio.hh>
57 # include <scribo/filter/objects_small.hh>
59 # include <scribo/primitive/group/from_single_link.hh>
61 # include <scribo/primitive/link/merge_double_link.hh>
62 # include <scribo/primitive/link/internal/dmax_width_and_height.hh>
63 # include <scribo/primitive/link/with_single_left_link_dmax_ratio.hh>
64 # include <scribo/primitive/link/with_single_right_link_dmax_ratio.hh>
66 # include <scribo/preprocessing/denoise_fg.hh>
69 # include <scribo/text/recognition.hh>
70 # endif // ! SCRIBO_NOCR
72 # include <scribo/text/merging.hh>
73 # include <scribo/text/link_lines.hh>
74 # include <scribo/text/extract_paragraphs.hh>
76 # include <scribo/make/debug_filename.hh>
78 # include <scribo/debug/decision_image.hh>
79 # include <scribo/debug/bboxes_image.hh>
80 # include <scribo/debug/linked_bboxes_image.hh>
81 # include <scribo/debug/bboxes_enlarged_image.hh>
82 # include <scribo/debug/mean_and_base_lines_image.hh>
83 # include <scribo/debug/looks_like_a_text_line_image.hh>
85 # include <scribo/toolchain/internal/toolchain_functor.hh>
87 # include <scribo/io/xml/save.hh>
// Member declarations of the content_in_doc_functor<I> class template.
// NOTE(review): the class head and some members (e.g. enable_ocr, which
// nsteps() reads) are not visible in this excerpt.
102 template <
typename I>
// Number of progress steps: a fixed base of 10 plus one per enabled option
// (see the definition of nsteps() further down).
111 virtual int nsteps()
const;
// Hook; presumably invoked once the XML output has been written -- confirm
// against the base toolchain functor.
113 virtual void on_xml_saved();
// Member template head; presumably belongs to operator(), which is defined
// below with the same <typename J> parameter -- declaration line not visible.
119 template <
typename J>
// Guards the "Denoise..." step in operator().
127 bool enable_denoising;
// Guards line-separator detection and removal in operator().
128 bool enable_line_seps;
// Tested in operator() near the "Find whitespace separators..." step.
129 bool enable_whitespace_seps;
// Counted as one extra step in nsteps(); defaults to false in the constructor.
131 bool save_doc_as_xml;
// XML output flavour; defaults to scribo::io::xml::PageExtended.
132 scribo::io::xml::Format xml_format;
// Presumably the language handed to the OCR step -- TODO confirm, the use
// site is not visible here.
138 std::string ocr_language;
// Output path; defaults to "/tmp/foo.xml" in the constructor.
139 std::string output_file;
// Progress/timing hooks, only declared when SCRIBO_NDEBUG is not defined.
146 # ifndef SCRIBO_NDEBUG
150 virtual void on_start();
151 virtual void on_end();
152 virtual void on_progress();
156 # endif // ! SCRIBO_NDEBUG
161 # ifndef MLN_INCLUDE_ONLY
// Constructor member-initializer list (the constructor name line and the end
// of the list are not visible in this excerpt).
163 template <
typename I>
// Denoising, line-separator and whitespace-separator handling all default to on.
165 : enable_denoising(true),
166 enable_line_seps(true),
167 enable_whitespace_seps(true),
// XML export is off by default; when enabled the default flavour is PageExtended.
169 save_doc_as_xml(false),
170 xml_format(scribo::io::xml::PageExtended),
// NOTE(review): hard-coded default path in a shared temporary directory;
// callers should presumably override output_file before saving -- confirm.
172 output_file(
"/tmp/foo.xml"),
// Main processing entry point: takes the original image and a processed
// (binary) image, runs the analysis steps below and stores the results in `doc`.
// NOTE(review): many lines of this definition (return type, braces, most call
// expressions) are missing from this excerpt; comments describe only what is
// visible.
182 template <
typename I>
183 template <
typename J>
185 content_in_doc_functor<I>::operator()(
const Image<J>& original_image,
// Both input images must be valid.
188 mln_precondition(
exact(original_image).is_valid());
189 mln_precondition(
exact(processed_image).is_valid());
// Register the original and the binary image on the document.
193 doc.set_image(
exact(original_image));
194 doc.set_binary_image(
exact(processed_image));
// Working image starts as the processed image and is refined by later steps.
199 input_cleaned = exact(processed_image);
// Optional step: vertical/horizontal line separators.
200 if (enable_line_seps)
203 on_new_progress_label(
"Find vertical and horizontal separators...");
// Store both separator images on the document.
211 doc.set_vline_separators(vseparators);
212 doc.set_hline_separators(hseparators);
// Merge vertical and horizontal separators into a single image.
214 separators = vseparators;
215 separators += hseparators;
222 on_new_progress_label("Remove separators...");
// Replace the working image with the processed image minus the separators.
224 input_cleaned = primitive::remove::separators(processed_image,
// Debug builds only: output tagged "input_wo_separators" (call not visible here).
231 # ifndef SCRIBO_NDEBUG
233 if (enable_line_seps)
245 "input_wo_separators");
247 # endif // ! SCRIBO_NDEBUG
// Optional step: denoising.
251 if (enable_denoising)
253 on_new_progress_label(
"Denoise...");
// Debug builds only: output of input_cleaned tagged "denoised".
258 # ifndef SCRIBO_NDEBUG
260 input_cleaned,
"denoised");
261 # endif // ! SCRIBO_NDEBUG
// Component extraction.
267 on_new_progress_label(
"Finding components...");
// Attach the merged line separators to the component set when they were computed.
279 if (enable_line_seps)
280 components.add_separators(separators);
282 on_new_progress_label(
"Filtering components");
// Link objects to their left and right neighbours; both use dmax_default(1).
290 on_new_progress_label(
"Linking objects...");
292 object_links<L> left_link
295 primitive::link::internal::dmax_default(1),
299 object_links<L> right_link
302 primitive::link::internal::dmax_default(1),
// Debug builds only: auxiliary output for the links (details not visible here).
306 # ifndef SCRIBO_NDEBUG
310 debug::AuxiliaryResults,
321 # endif // ! SCRIBO_NDEBUG
// Link filtering; produces hratio_filtered_links.
331 on_new_progress_label(
"Filtering objects");
334 object_links<L> hratio_filtered_links
// Debug builds only: build a decision image for the filtered links and log it
// as "hratio_links_decision_image".
338 # ifndef SCRIBO_NDEBUG
342 hratio_decision_image = scribo::debug::
decision_image(processed_image,
344 hratio_filtered_links,
347 debug::
logger().log_image(debug::AuxiliaryResults,
348 hratio_decision_image,
349 "hratio_links_decision_image");
351 # endif // ! SCRIBO_NDEBUG
// Build the line set from the object groups.
356 on_new_progress_label(
"Rebuilding lines");
365 lines = scribo::make::line_set(groups);
// Optional step: whitespace separators.
// NOTE(review): the extent of this `if` is not visible in this excerpt.
370 if (enable_whitespace_seps)
373 doc.set_paragraphs(parset);
376 on_new_progress_label(
"Find whitespace separators...");
// res.second() is kept as `whitespaces`, added to the components' separators,
// and registered on the document together with res.first().
380 whitespaces = res.
second();
384 components.add_separators(res.
second());
385 doc.set_whitespace_separators(res.
second(), res.
first());
// Debug builds only: "step1_*" auxiliary outputs.
390 # ifndef SCRIBO_NDEBUG
392 components.separators(),
397 if (enable_whitespace_seps)
399 whitespaces,
"whitespaces");
403 debug::AuxiliaryResults,
409 debug::AuxiliaryResults,
411 "step1_bboxes_enlarged");
415 debug::AuxiliaryResults,
417 "step1_looks_like_a_text_line");
421 debug::AuxiliaryResults,
425 # endif // ! SCRIBO_NDEBUG
// Debug builds only: "step2_*" auxiliary outputs.
434 # ifndef SCRIBO_NDEBUG
440 debug::AuxiliaryResults,
446 debug::AuxiliaryResults,
448 "step2_looks_like_a_text_line");
452 debug::AuxiliaryResults,
456 # endif // ! SCRIBO_NDEBUG
// Text recognition; sits inside a SCRIBO_NOCR preprocessor guard whose opening
// line is not visible in this excerpt.
466 on_new_progress_label(
"Recognizing text");
472 # endif // ! SCRIBO_NOCR
// Store the paragraph set on the document.
476 doc.set_paragraphs(parset);
// Element extraction, then identification; the identified elements are stored
// on the document.
482 on_new_progress_label(
"Extracting Elements");
490 on_new_progress_label(
"Identifying Elements");
491 elements = scribo::primitive::identify(elements);
492 doc.set_elements(elements);
// Final step: saving (the save call itself is not visible in this excerpt).
501 on_new_progress_label(
"Saving results");
// Returns the number of progress steps: 10 fixed steps plus one for each
// enabled option (each bool is promoted to 0 or 1 in the sum).
// NOTE(review): enable_ocr is read here but its declaration is not visible in
// this excerpt.
518 content_in_doc_functor<I>::nsteps()
const
520 return 10 + enable_denoising + enable_line_seps
521 + enable_whitespace_seps + enable_ocr + save_doc_as_xml;
// Definition of the on_xml_saved() hook; the body is not visible in this
// excerpt. Presumably called after the XML output is written -- TODO confirm.
527 content_in_doc_functor<I>::on_xml_saved()
// Debug-only progress hooks: compiled only when SCRIBO_NDEBUG is not defined.
532 # ifndef SCRIBO_NDEBUG
// on_start(): body not visible in this excerpt.
534 template <
typename I>
536 content_in_doc_functor<I>::on_start()
// on_end(): prints "Total time: " followed by `gt` on standard output.
// NOTE(review): `gt` is presumably a global timer (mln/util/timer.hh is
// included under the same guard) -- confirm; its declaration is not visible.
542 template <
typename I>
544 content_in_doc_functor<I>::on_end()
548 std::cout <<
"Total time: " << gt << std::endl;
// on_progress(): prints `t` on standard output; std::endl also flushes the stream.
// NOTE(review): `t` is presumably a per-step timer -- confirm.
551 template <
typename I>
553 content_in_doc_functor<I>::on_progress()
557 std::cout << t << std::endl;
562 # endif // ! SCRIBO_NDEBUG
564 # endif // ! MLN_INCLUDE_ONLY
572 #endif // ! SCRIBO_TOOLCHAIN_INTERNAL_CONTENT_IN_DOC_FUNCTOR_HH