32 #ifndef SCRIBO_TEXT_EXTRACT_PARAGRAPHS_HDOC_HH
33 # define SCRIBO_TEXT_EXTRACT_PARAGRAPHS_HDOC_HH
35 #include <mln/util/array.hh>
36 #include <mln/accu/shape/bbox.hh>
37 #include <mln/core/image/image2d.hh>
38 #include <mln/core/alias/neighb2d.hh>
39 #include <mln/draw/box.hh>
40 #include <mln/data/convert.hh>
41 #include <mln/value/int_u16.hh>
42 #include <mln/value/label_16.hh>
43 #include <mln/value/int_u8.hh>
44 #include <mln/value/rgb8.hh>
45 #include <mln/io/ppm/save.hh>
46 #include <mln/io/pgm/save.hh>
47 #include <mln/geom/rotate.hh>
48 #include <mln/literal/colors.hh>
50 #include <scribo/core/def/lbl_type.hh>
51 #include <scribo/core/macros.hh>
52 #include <scribo/core/line_set.hh>
53 #include <scribo/core/line_links.hh>
54 #include <scribo/core/line_info.hh>
55 #include <scribo/core/paragraph_set.hh>
76 # ifndef MLN_INCLUDE_ONLY
// Judging by its name and by how callers use its result (as a boolean
// condition), this routine checks whether a horizontal separator lies
// between the two lines l1_ and l2_ of `lines`.
// Precondition: the component set of `lines` carries separator data.
84 between_horizontal_separator(
const line_set<L>& lines,
// Resolve both line ids to their line_info records.
88 const line_info<L>&
l1 = lines(l1_);
89 const line_info<L>&
l2 = lines(l2_);
// Separator information must be available on the components.
92 mln_precondition(lines.components().has_separators());
98 row1 = l1_bbox.pcenter().row(),
// Columns of the two lateral probes; they are assigned further down.
105 unsigned left_col_ptr;
106 unsigned right_col_ptr;
111 row1 = l1_bbox.pmax().row();
// One quarter of the half-width of l1's bounding box
// (center column minus min column, shifted right by 2).
114 const unsigned quarter =
115 ((l1_bbox.pcenter().col() - l1_bbox.pmin().col()) >> 2);
// Central probe at the center of l1; the lateral probes sit `quarter`
// columns inside the left and right edges of l1's box.
117 row = l1_bbox.pcenter().row();
118 col_ptr = l1_bbox.pcenter().col();
119 left_col_ptr = l1_bbox.pmin().col() + quarter;
120 right_col_ptr = l1_bbox.pmax().col() - quarter;
// Alternative set-up: row1 is the top row of l1's box and the lateral
// probes are derived from l2's bounding box instead.
126 row1 = l1_bbox.pmin().row();
128 const unsigned quarter =
133 left_col_ptr = l2_bbox.
pmin().
col() + quarter;
134 right_col_ptr = l2_bbox.
pmax().
col() - quarter;
// Looks, among the links stored in `right`, for a line other than `index`
// whose right link is `index`, and compares that candidate with
// `current_line`.
//
// \param right         Right links of the lines.
// \param index         Id of the line the candidates must link to.
// \param current_line  Id of the line the candidates are compared with.
// \param lines         The line set both ids refer to.
//
// Presumably returns true as soon as one candidate satisfies the test
// below -- TODO confirm against the return statements.
152 template <
typename L>
153 bool may_have_another_left_link(
const line_links<L>&
right,
154 const line_id_t& index,
155 const line_id_t& current_line,
156 const line_set<L>& lines)
// Reference data taken from current_line: top-left corner of its
// bounding box and its x-height.
158 const line_info<L>& l = lines(current_line);
159 const point2d& pmin = l.bbox().pmin();
160 const unsigned x1 = l.x_height();
// Consider every link i, different from `index`, whose right link is
// `index`.
162 for_all_links(i, right)
163 if (i != index && right(i) == index)
165 const line_info<L>& l_info = lines(i);
166 const unsigned x2 = l_info.x_height();
// Baseline tolerance: half of the smaller of the two x-heights.
168 const float delta_max = 0.5f *
std::min(x1, x2);
// The candidate starts at a smaller column than current_line and the two
// baselines differ by less than delta_max.
170 if (l_info.bbox().pmin().col() < pmin.
col()
171 &&
std::abs(l.baseline() - l_info.baseline()) < delta_max
// Derives paragraph links from line links.
//
// \param left    Left links of the lines (read only).
// \param right   Right links of the lines; one rule below also writes to it.
// \param output  Result. Initialised as a copy of `left`, then edited.
// \param lines   The line set the links refer to.
//
// For every text line l the code looks at left_nbh = output(l),
// right_nbh = right(l) and lol_nbh = output(left_nbh), and applies a
// cascade of geometric rules. Whenever a rule fires, the statement
// `output(x) = x` makes x link to itself -- presumably this is how a
// paragraph link is cut; confirm against line_links semantics.
184 template <
typename L>
186 void paragraph_links(
const line_links<L>&
left,
187 line_links<L>& right,
188 line_links<L>& output,
189 const line_set<L>& lines)
// Work on a copy of the left links.
191 output = left.duplicate();
196 for_all_lines(l, lines)
197 if (lines(l).is_textline())
// Neighbours of l: its current link in output, its right link, and the
// link of its left neighbour.
201 line_id_t left_nbh = output(l);
202 line_id_t right_nbh =
right(l);
203 line_id_t lol_nbh = output(left_nbh);
// Separator rule (only when separator data exists): a separator found
// between right_nbh and l makes right_nbh link to itself.
209 if (lines.components().has_separators() &&
210 between_horizontal_separator(lines, right_nbh, l))
212 output(right_nbh) = right_nbh;
// Same separator test between l and left_nbh.
215 if (lines.components().has_separators() &&
216 between_horizontal_separator(lines, l, left_nbh))
// x-heights of the current line and of its two neighbours.
224 const float x_height = lines(l).x_height();
225 const float left_x_height = lines(left_nbh).x_height();
226 const float right_x_height = lines(right_nbh).x_height();
// Bounding boxes of the four lines involved.
228 const box2d& left_line_bbox = lines(left_nbh).
bbox();
229 const box2d& current_line_bbox = lines(l).
bbox();
230 const box2d& right_line_bbox = lines(right_nbh).
bbox();
231 const box2d& lol_line_bbox = lines(lol_nbh).
bbox();
// Minimum columns of those boxes.
233 const int lline_col_min = left_line_bbox.
pmin().
col();
234 const int cline_col_min = current_line_bbox.
pmin().
col();
235 const int rline_col_min = right_line_bbox.
pmin().
col();
236 const int lolline_col_min = lol_line_bbox.
pmin().
col();
// Maximum columns of the left, current and right boxes.
238 const int lline_col_max = left_line_bbox.
pmax().
col();
239 const int cline_col_max = current_line_bbox.
pmax().
col();
240 const int rline_col_max = right_line_bbox.
pmax().
col();
// Character widths of the left, current and right lines.
242 const int lline_cw = lines(left_nbh).char_width();
243 const int cline_cw = lines(l).char_width();
244 const int rline_cw = lines(right_nbh).char_width();
// Tolerance of every alignment test below: one character width of the
// current line.
246 const int delta_alignment = cline_cw;
252 const int c_baseline = lines(l).baseline();
// Baseline offsets of the neighbours relative to the current line. A zero
// offset selects the special cases below; presumably it means that l has
// no distinct neighbour on that side -- TODO confirm.
255 const int lc_baseline = lines(left_nbh).baseline() - c_baseline;
256 const int rc_baseline = c_baseline -lines(right_nbh).baseline();
264 bool two_lines =
false;
// Case 1: the left neighbour has the same baseline as l.
267 if (lc_baseline == 0)
// Line linked to the right of right_nbh.
270 const line_id_t ror_nbh =
right(right_nbh);
// Use it only if it is distinct from right_nbh and output links it back
// to right_nbh.
274 if (ror_nbh != right_nbh
275 && output(ror_nbh) == right_nbh)
// Gaps measured as meanline of one line minus baseline of the next one.
278 const float right_distance = lines(l).meanline() - lines(right_nbh).baseline();
280 const float ror_distance = lines(right_nbh).meanline() - lines(ror_nbh).baseline();
282 const float ror_x_height = lines(ror_nbh).x_height();
// The gap l/right_nbh exceeds 1.4 times the gap right_nbh/ror_nbh while
// right_nbh and ror_nbh have x-heights within a 1.4 ratio, and right_nbh
// is linked to l: make right_nbh link to itself.
286 if (right_distance > 1.4f * ror_distance
287 &&
std::max(ror_x_height, right_x_height) <
288 1.4f *
std::min(ror_x_height, right_x_height)
289 && output(right_nbh) == l)
291 output(right_nbh) = right_nbh;
299 const float distance = lines(l).meanline() - lines(right_nbh).baseline();
// The gap exceeds twice the smaller x-height and right_nbh is linked to
// l: make right_nbh link to itself.
305 if (distance > 2.0f *
std::min(x_height, right_x_height)
306 && output(right_nbh) == l)
308 output(right_nbh) = right_nbh;
// Font-size comparison between l and right_nbh.
314 const float min_x_height =
std::min(x_height, right_x_height);
315 const float max_x_height =
std::max(x_height, right_x_height);
316 const float min_char_width =
std::min(rline_cw, cline_cw);
317 const float max_char_width =
std::max(rline_cw, cline_cw);
// x-heights differ by more than a 1.4 ratio and character widths by more
// than a 1.2 ratio.
321 if ((max_x_height > min_x_height * 1.4f) &&
322 !(max_char_width <= 1.2f * min_char_width))
324 if (output(right_nbh) == l)
326 output(right_nbh) = right_nbh;
// Case 2: the right neighbour has the same baseline as l. Same tests as
// case 1, applied to left_nbh and lol_nbh.
336 else if (rc_baseline == 0)
341 if (lol_nbh != left_nbh)
344 const float left_distance = lines(left_nbh).meanline() -
348 const float lol_distance = lines(lol_nbh).meanline() -
349 lines(left_nbh).baseline();
351 const float lol_x_height = lines(lol_nbh).x_height();
// Gap ratio above 1.4 with x-heights of lol_nbh and left_nbh within a 1.4
// ratio.
355 if (left_distance > 1.4f * lol_distance
356 &&
std::max(lol_x_height, left_x_height) <
357 1.4f *
std::min(lol_x_height, left_x_height))
367 const float distance = lines(left_nbh).meanline() -
// Gap larger than twice the smaller x-height.
374 if (distance > 2.0f *
std::min(x_height, left_x_height))
// Font-size comparison between l and left_nbh.
382 const float min_x_height =
std::min(x_height, left_x_height);
383 const float max_x_height =
std::max(x_height, left_x_height);
384 const float min_char_width =
std::min(lline_cw, cline_cw);
385 const float max_char_width =
std::max(lline_cw, cline_cw);
389 if ((max_x_height > min_x_height * 1.4f) &&
390 !(max_char_width <= 1.2f * min_char_width))
// Comparison of the gaps on both sides of l: the left gap exceeds 1.5
// times the right gap and the x-heights of l and left_nbh differ by more
// than a 1.2 ratio.
412 if (left_distance > 1.5f * right_distance
413 &&
std::max(x_height, left_x_height) > 1.2f *
std::min(x_height, left_x_height))
// Symmetric test for the right side (note the non-strict >= here); it
// additionally requires right_nbh to be linked to l.
420 else if (right_distance > 1.5f * left_distance
421 &&
std::max(x_height, right_x_height) >= 1.2f *
std::min(x_height, right_x_height)
422 && output(right_nbh) == l)
424 output(right_nbh) = right_nbh;
// Ordering of the font-size tests depends on which baseline offset is
// larger. Left side compared first here.
433 if (lc_baseline > rc_baseline)
435 const float cw_max =
std::max(lline_cw, cline_cw);
436 const float cw_min =
std::min(lline_cw, cline_cw);
437 const float min_x_height =
std::min(x_height, left_x_height);
438 const float max_x_height =
std::max(x_height, left_x_height);
440 if ((max_x_height > min_x_height * 1.4f) &&
441 !(cw_max <= 1.2f * cw_min))
// Then the right side.
448 const float min_x_height =
std::min(x_height, right_x_height);
449 const float max_x_height =
std::max(x_height, right_x_height);
450 const float cw_max =
std::max(rline_cw, cline_cw);
451 const float cw_min =
std::min(rline_cw, cline_cw);
453 if ((max_x_height > min_x_height * 1.4f)
454 && !(cw_max <= 1.2f * cw_min)
455 && output(right_nbh) == l)
457 output(right_nbh) = right_nbh;
// Other ordering: right side compared first...
464 const float cw_max =
std::max(rline_cw, cline_cw);
465 const float cw_min =
std::min(rline_cw, cline_cw);
466 const float min_x_height =
std::min(x_height, right_x_height);
467 const float max_x_height =
std::max(x_height, right_x_height);
469 if ((max_x_height > min_x_height * 1.4f)
470 && !(cw_max <= 1.2f * cw_min)
471 && output(right_nbh) == l)
473 output(right_nbh) = right_nbh;
// ... then the left side.
478 const float min_x_height =
std::min(x_height, left_x_height);
479 const float max_x_height =
std::max(x_height, left_x_height);
480 const float cw_max =
std::max(lline_cw, cline_cw);
481 const float cw_min =
std::min(lline_cw, cline_cw);
483 if ((max_x_height > min_x_height * 1.4f)
484 && !(cw_max <= 1.2f * cw_min))
// Alignment rule on minimum columns: left_nbh is aligned with right_nbh
// and with lol_nbh (differences below delta_alignment).
516 bool left_right_aligned =
false;
517 bool left_lol_aligned =
false;
518 const int dx_lr =
std::abs(lline_col_min - rline_col_min);
519 const int dx_llol =
std::abs(lline_col_min - lolline_col_min);
521 if (dx_lr < delta_alignment)
522 left_right_aligned =
true;
524 if (dx_llol < delta_alignment)
525 left_lol_aligned =
true;
527 if (left_right_aligned && left_lol_aligned)
529 const int left_right_col_min =
std::min(lline_col_min, rline_col_min);
530 const int dx_lrc =
std::abs(left_right_col_min - cline_col_min);
// Note: despite its name this is 1.5 times the character width of l.
531 const float l_char_width = 1.5f * lines(l).char_width();
// The current line starts at a greater column than both neighbours, by
// more than 1.5 and less than 4.5 character widths.
533 if (dx_lrc > l_char_width &&
534 dx_lrc < 3.0f * l_char_width &&
535 cline_col_min > rline_col_min &&
536 cline_col_min > lline_col_min)
// Line right_nbh currently links to in output. If that is not l but it
// links to l itself, it is made to link to itself.
538 const line_id_t out_right_nbh = output(right_nbh);
540 if (out_right_nbh != l)
542 if (output(out_right_nbh) == l)
543 output(out_right_nbh) = out_right_nbh;
547 output(right_nbh) = right_nbh;
// Alignment rule on maximum columns of left_nbh/right_nbh combined with
// the minimum columns of left_nbh/l and lol_nbh/l.
569 bool left_right_max_aligned =
false;
570 bool left_current_min_aligned =
false;
571 bool lol_current_min_aligned =
false;
// True when left_nbh links to itself in output.
572 const bool lol_is_left = output(left_nbh) == left_nbh;
573 const int dx_lr_max =
std::abs(lline_col_max - rline_col_max);
574 const int dx_lc_min =
std::abs(lline_col_min - cline_col_min);
575 const int dx_lolc_min =
std::abs(lolline_col_min - cline_col_min);
577 if (dx_lr_max < delta_alignment)
578 left_right_max_aligned =
true;
580 if (dx_lc_min < delta_alignment)
581 left_current_min_aligned =
true;
583 if (dx_lolc_min < delta_alignment)
584 lol_current_min_aligned =
true;
// l is not aligned with left_nbh on the min side, left_nbh and right_nbh
// are aligned on the max side, and either lol_nbh is aligned with l or
// left_nbh links to itself.
586 if (!left_current_min_aligned && left_right_max_aligned &&
587 (lol_current_min_aligned || lol_is_left))
589 const int dx_lrc =
std::abs(lline_col_max - cline_col_max);
590 const int l_char_width = lines(l).char_width();
592 rline_col_min) - cline_col_min);
// l ends before left_nbh by more than one character width, starts before
// it, and dx_indent stays below four times the alignment tolerance.
594 if (dx_lrc > l_char_width &&
595 dx_indent < 4 * delta_alignment &&
596 cline_col_max < lline_col_max &&
597 cline_col_min < lline_col_min &&
598 (lline_col_min > lolline_col_min || lol_is_left))
// Rule involving ror_nbh, the right link of right_nbh.
622 const line_id_t ror_nbh =
right(right_nbh);
623 const box2d& ror_line_bbox = lines(ror_nbh).
bbox();
624 const int rorline_col_min = ror_line_bbox.
pmin().
col();
626 bool right_ror_min_aligned =
false;
627 bool left_right_aligned =
false;
628 const int dx_lr =
std::abs(lline_col_min - rline_col_min);
629 const int dx_rror_min =
std::abs(rline_col_min - rorline_col_min);
631 if (dx_rror_min < delta_alignment)
632 right_ror_min_aligned =
true;
634 if (dx_lr < delta_alignment)
635 left_right_aligned =
true;
// left_nbh, right_nbh and ror_nbh share their minimum column (within the
// tolerance) and ror_nbh is a distinct line.
637 if (right_ror_min_aligned && left_right_aligned &&
638 ror_nbh != right_nbh)
640 const int left_right_col_min =
std::min(lline_col_min, rline_col_min);
641 const int dx_lrc =
std::abs(left_right_col_min - cline_col_min);
// 1.5 times the character width of l.
642 const float l_char_width = 1.5f * lines(l).char_width();
// l starts at a greater column than both neighbours, by more than 1.5 and
// less than 15 character widths, and no other line qualifies as a left
// link of right_nbh.
644 if (dx_lrc > l_char_width &&
645 !may_have_another_left_link(right, right_nbh, l, lines) &&
646 dx_lrc < 10.0f * l_char_width &&
647 cline_col_min > rline_col_min &&
648 cline_col_min > lline_col_min)
650 const line_id_t out_right_nbh = output(right_nbh);
652 if (out_right_nbh != l)
654 if (output(out_right_nbh) == l)
655 output(out_right_nbh) = out_right_nbh;
659 output(right_nbh) = right_nbh;
// Rule comparing left_nbh with ror_nbh directly.
681 const line_id_t ror_nbh =
right(right_nbh);
682 const box2d& ror_line_bbox = lines(ror_nbh).
bbox();
683 const int rorline_col_min = ror_line_bbox.
pmin().
col();
685 bool left_ror_aligned =
false;
686 const int dx_lror =
std::abs(lline_col_min - rorline_col_min);
688 if (dx_lror < delta_alignment)
689 left_ror_aligned =
true;
691 if (left_ror_aligned)
693 const int left_ror_col_min =
std::min(lline_col_min, rorline_col_min);
694 const int dx_lrorc =
std::abs(left_ror_col_min - cline_col_min);
// 1.5 times the character width of l.
695 const float l_char_width = 1.5f * lines(l).char_width();
696 const int dx_lrorr =
std::abs(left_ror_col_min - rline_col_min);
697 const int dx_crmax =
std::abs(rline_col_max - cline_col_max);
// right_nbh is shifted much further than l with respect to the common
// minimum column of left_nbh and ror_nbh, and its maximum column is far
// from the one of l.
699 if (dx_lrorc > l_char_width &&
700 dx_lrorr > 5 * l_char_width &&
701 dx_lrorr > dx_lrorc &&
702 dx_crmax > 5 * l_char_width &&
703 !may_have_another_left_link(right, right_nbh, l, lines) &&
704 dx_lrorc < 10.0f * l_char_width &&
705 cline_col_min > rorline_col_min &&
706 cline_col_min > lline_col_min)
// This is the one place where `right` (not `output`) is modified.
708 right(right_nbh) = right_nbh;
// right_nbh starts beyond the center column of l and ends after l, no
// other line qualifies as its left link, and it is linked to l.
717 if (rline_col_min > current_line_bbox.
pcenter().
col()
718 && !may_have_another_left_link(right, right_nbh, l, lines)
719 && cline_col_max < rline_col_max
720 && output(right_nbh) == l)
722 output(right_nbh) = right_nbh;
// Rule based on the alignment of right_nbh with ror_nbh only.
741 const line_id_t ror_nbh =
right(right_nbh);
742 const box2d& ror_line_bbox = lines(ror_nbh).
bbox();
743 const int rorline_col_min = ror_line_bbox.
pmin().
col();
745 bool right_ror_min_aligned =
false;
746 const int dx_rror_min =
std::abs(rline_col_min - rorline_col_min);
748 if (dx_rror_min < delta_alignment)
749 right_ror_min_aligned =
true;
751 if (right_ror_min_aligned)
753 const int right_ror_col_min =
std::min(rline_col_min, rorline_col_min);
754 const int dx_rrorc =
std::abs(right_ror_col_min - cline_col_min);
// 1.5 times the character width of l.
755 const float l_char_width = 1.5f * lines(l).char_width();
// l starts at a greater column than right_nbh, by more than 1.5 and less
// than 15 character widths, and does not end before it.
757 if (dx_rrorc > l_char_width &&
758 dx_rrorc < 10.0f * l_char_width &&
759 cline_col_min > rline_col_min &&
760 cline_col_max >= rline_col_max)
762 const line_id_t out_right_nbh = output(right_nbh);
764 if (out_right_nbh != l)
766 if (output(out_right_nbh) == l)
767 output(out_right_nbh) = out_right_nbh;
771 output(right_nbh) = right_nbh;
// Preparation pass over the text lines of `lines`, run by the driver
// before process_left_link and process_right_link. For each text line it
// derives two integer codes from the line id and records the line's box
// (from rbbox) in `drawn_lines`, keyed by a column.
//
// \param domain  A box2d; the caller passes the domain of the input image.
// \param lines   The line set to process.
788 template <
typename L>
790 void prepare_lines(
const box2d& domain,
791 const line_set<L>& lines,
// For each column, the boxes already recorded at that column.
795 std::map< int, std::vector< const box2d* > > drawn_lines;
800 for_all_lines(l, lines)
801 if (lines(l).is_textline())
// Two codes per line: 2 * (l + 1) (even) and 2 * (l + 1) + 1 (odd).
808 const unsigned index = l + 1;
809 const unsigned even_index = 2 * index;
810 const unsigned odd_index = even_index + 1;
// First placement: candidate column computed from the maximum column of
// box b plus col_offset.
814 bool not_finished =
true;
822 const int col = b.
pmax().
col() + col_offset;
823 std::map< int, std::vector< const box2d* > >::iterator it
824 = drawn_lines.find(col);
// Some boxes are already recorded at this column: inspect them.
826 if (it != drawn_lines.end())
// NOTE(review): this local `lines` shadows the function parameter of the
// same name inside this scope.
828 const std::vector< const box2d* >& lines = (*it).second;
829 const unsigned nb_lines = lines.size();
832 for (i = 0; i < nb_lines; ++i)
// Row-range test between the current box and a recorded one; presumably
// an overlap check -- TODO confirm with the definitions of min_row and
// max_row.
838 if (min_row - max_row <= 0)
// Placement accepted: record the box of line l at this column.
846 not_finished =
false;
847 drawn_lines[col].push_back(&(rbbox[l]));
// Nothing recorded at this column yet: record the box directly.
856 not_finished =
false;
857 drawn_lines[col].push_back(&(rbbox[l]));
// Second placement, same scheme, with the candidate column computed from
// the minimum column of box b minus col_offset.
864 bool not_finished =
true;
872 const int col = b.
pmin().
col() - col_offset;
873 std::map< int, std::vector< const box2d* > >::iterator it
874 = drawn_lines.find(col);
876 if (it != drawn_lines.end())
// NOTE(review): shadows the `lines` parameter, as above.
878 const std::vector< const box2d* >& lines = (*it).second;
879 const unsigned nb_lines = lines.size();
882 for (i = 0; i < nb_lines; ++i)
884 const box2d* box = lines[i];
888 if (min_row - max_row <= 0)
896 not_finished =
false;
897 drawn_lines[col].push_back(&(rbbox[l]));
906 not_finished =
false;
907 drawn_lines[col].push_back(&(rbbox[l]));
// Fills the left links of the text lines by walking through the values of
// `blocks`, starting from each line's box in rbbox and moving towards
// decreasing addresses.
//
// \param blocks  Image whose values encode line ids; the visible code
//                decodes an id as (v >> 1) - 1, or ((v - 1) >> 1) - 1.
// \param lines   The line set to process.
// The links are written through `left`.
914 template <
typename L>
917 process_left_link(L& blocks,
919 const line_set<L>& lines,
// First pass over the text lines.
925 for_all_lines(l, lines)
926 if (lines(l).is_textline())
// Second pass: look for a left neighbour of each text line.
934 for_all_lines(l, lines)
935 if (lines(l).is_textline())
// Extra search distance: 1.5 times the x-height, truncated to int.
938 int dmax = 1.5f * lines(l).x_height();
// c: center of the line's box. q: same column as c, one quarter of the
// way from the box's minimum row to the center row.
941 point2d c = rbbox(l).pcenter();
942 point2d q(rbbox(l).pmin().row() + ((c.
row() - rbbox(l).pmin().row()) / 4), c.
col());
// Half of the box's column extent.
945 midcol = (rbbox(l).pmax().col()
946 - rbbox(l).pmin().col()) / 2;
// Number of steps: bounded by the distance from c to the minimum column
// of the domain of `blocks`, and by midcol + dmax.
952 nleftima = c.
col() - blocks.domain().pmin().col(),
954 nleft =
std::min(nleftima, midcol + dmax);
961 *pstop = p - nleft - 1,
// Move both pointers backwards in lockstep until pstop is reached.
966 for (; p != pstop; --
p, --p2)
// The candidate decoded from *p2 must not already have l as its own left
// link.
970 &&
left((*p2 >> 1) - 1) != l)
// Same check for the candidate decoded from *p.
979 &&
left((*p >> 1) - 1) != l)
// Finer scan from nbh_p: walk backwards until the value end_p (initial
// value plus one) is met, remembering the even values seen on the way.
1010 std::vector<V> lines_nbh;
1011 const V end_p = *nbh_p + 1;
1012 const V* nbh_p_copy = nbh_p;
1014 for (; *nbh_p != end_p; --nbh_p)
1018 if ((*nbh_p) % 2 == 0)
1020 lines_nbh.push_back(*nbh_p);
// The value just below the current one was recorded earlier in the scan.
1028 if (
std::find(lines_nbh.begin(), lines_nbh.end(),
1029 (*nbh_p) - 1) != lines_nbh.end())
// Link l to the line decoded from the current value, unless that line
// already has l as its left link.
1035 &&
left(((*nbh_p - 1) >> 1) - 1) != l)
1036 left(l) = ((*nbh_p - 1) >> 1) - 1;
// The scan stopped on end_p: link l to the line decoded from the value
// the scan started from.
1049 if (*nbh_p == end_p)
1050 left(l) = (*nbh_p_copy >> 1) - 1;
// Counterpart of process_left_link: fills the right links of the text
// lines by walking through the values of `blocks` towards increasing
// addresses.
//
// \param blocks  Image whose values encode line ids; the visible code
//                decodes an id as ((v - 1) >> 1) - 1, or (v >> 1) - 1.
// \param lines   The line set to process.
// \param right   Right links, updated in place.
1058 template <
typename L>
1061 process_right_link(L& blocks,
1063 const line_set<L>& lines,
1064 line_links<L>& right)
// First pass over the text lines.
1069 for_all_lines(l, lines)
1070 if (lines(l).is_textline())
// Second pass: look for a right neighbour of each text line.
1078 for_all_lines(l, lines)
1079 if (lines(l).is_textline())
// Extra search distance: 1.5 times the x-height, truncated to int.
1082 int dmax = 1.5f * lines(l).x_height();
// c: center of the line's box. q: same column as c, one quarter of the
// way from the box's maximum row back to the center row.
1085 point2d c = rbbox(l).pcenter();
1086 point2d q(rbbox(l).pmax().row() - ((rbbox(l).pmax().row() - c.
row()) / 4), c.
col());
// Half of the box's column extent.
1089 midcol = (rbbox(l).pmax().col()
1090 - rbbox(l).pmin().col()) / 2;
// Number of steps: bounded by the number of columns remaining after c in
// `blocks`, and by midcol + dmax.
1095 nrightima =
geom::ncols(blocks) - c.
col() + blocks.domain().pmin().col(),
1096 nright =
std::min(nrightima, midcol + dmax);
1103 *pstop = p + nright - 1,
// Move both pointers forwards in lockstep until pstop is reached.
1108 for (; p != pstop; ++
p, ++p2)
// The candidate decoded from *p2 must not already have l as its own right
// link.
1112 &&
right(((*p2 - 1) >> 1) - 1) != l)
// Same check for the candidate decoded from *p.
1121 &&
right(((*p - 1) >> 1) - 1) != l)
// Finer scan from nbh_p: walk forwards until the value end_p (initial
// value minus one) is met, remembering the odd values seen on the way.
1152 std::vector<V> lines_nbh;
1153 const V end_p = *nbh_p - 1;
1154 const V* nbh_p_copy = nbh_p;
1156 for (; *nbh_p != end_p; ++nbh_p)
1160 if (*nbh_p % 2 == 1)
1162 lines_nbh.push_back(*nbh_p);
// The value just above the current one was recorded earlier in the scan.
1170 if (
std::find(lines_nbh.begin(), lines_nbh.end(),
1171 *nbh_p + 1) != lines_nbh.end())
// Link l to the line decoded from the current value, unless that line
// already has l as its right link.
1177 &&
right((*nbh_p >> 1) - 1) != l)
1178 right(l) = (*nbh_p >> 1) - 1;
// The scan stopped on end_p: link l to the line decoded from the value
// the scan started from.
1191 if (*nbh_p == end_p)
1192 right(l) = ((*nbh_p_copy - 1) >> 1) - 1;
// Cross-checks the left and right links of every text line.
//
// \param left   Left links, possibly updated.
// \param right  Right links, possibly updated.
// \param lines  The line set the links refer to.
//
// For a text line l, the reverse link of each neighbour is inspected
// through a mutable reference; a neighbour whose reverse link points to
// itself is singled out. Presumably that reverse link is then set to l so
// that both directions agree -- TODO confirm.
1203 template<
typename L >
1205 void finalize_links(line_links<L>& left,
1206 line_links<L>& right,
1207 const line_set<L>& lines)
1211 for_all_lines(l, lines)
1212 if (lines(l).is_textline())
// Current neighbours of l.
1214 const unsigned left_value =
left(l);
1215 const unsigned right_value =
right(l);
// Right link of the left neighbour.
1220 line_id_t& v =
right(left_value);
// The left neighbour has no right link other than itself.
1222 if (v == left_value)
// Left link of the right neighbour.
1229 line_id_t& v =
left(right_value);
// The right neighbour has no left link other than itself.
1231 if (v == right_value)
// Driver of the paragraph extraction: builds the left and right line
// links, reconciles them, derives the paragraph links and turns them into
// a paragraph_set.
1240 template <
typename L>
// Link containers, one entry per line of `lines`.
1260 line_links<L>
left(lines);
1262 line_links<L>
right(lines);
1264 line_links<L> output(lines);
// One box per line, plus one extra slot.
1267 rbbox.
resize(lines.nelements() + 1);
// Pipeline: prepare the lines, search the left and right neighbours, then
// reconcile both link sets.
1269 internal::prepare_lines(input.
domain(), lines , blocks, rbbox);
1270 internal::process_left_link(blocks, rbbox, lines , left);
1271 internal::process_right_link(blocks, rbbox, lines , right);
1272 internal::finalize_links(left, right, lines );
// Decide which links hold inside a paragraph; the result is in `output`.
1274 internal::paragraph_links(left, right, output, lines);
// Build the paragraph set from the paragraph links and the right links.
1276 paragraph_set<L> par_set = make::paragraph(output, right);
1280 # endif // ! MLN_INCLUDE_ONLY
1286 #endif // ! SCRIBO_TEXT_EXTRACT_PARAGRAPHS_HDOC_HH