32 #ifndef SCRIBO_TEXT_EXTRACT_PARAGRAPHS_HH
33 # define SCRIBO_TEXT_EXTRACT_PARAGRAPHS_HH
35 #include <mln/util/array.hh>
36 #include <mln/accu/shape/bbox.hh>
37 #include <mln/core/image/image2d.hh>
38 #include <mln/core/alias/neighb2d.hh>
39 #include <mln/draw/box.hh>
40 #include <mln/data/convert.hh>
41 #include <mln/value/int_u16.hh>
42 #include <mln/value/label_16.hh>
43 #include <mln/value/int_u8.hh>
44 #include <mln/value/rgb8.hh>
45 #include <mln/io/ppm/save.hh>
46 #include <mln/io/pgm/save.hh>
47 #include <mln/geom/rotate.hh>
48 #include <mln/literal/colors.hh>
50 #include <scribo/core/def/lbl_type.hh>
51 #include <scribo/core/macros.hh>
52 #include <scribo/core/line_set.hh>
53 #include <scribo/core/line_links.hh>
54 #include <scribo/core/line_info.hh>
55 #include <scribo/core/paragraph_set.hh>
75 # ifndef MLN_INCLUDE_ONLY
90 while (parent(tmp_x) != tmp_x)
91 tmp_x = parent(tmp_x);
93 while (parent(x) != x)
95 const unsigned tmp = parent(x);
// Examines a pair of lines taken from `lines` with respect to horizontal
// separators (going by the name). The return type, the remaining
// parameters and the final test are not part of this excerpt.
108 template <
typename L>
111 between_horizontal_separator(
const line_set<L>& lines,
// The component set behind `lines` must provide separator information.
116 mln_precondition(lines.components().has_separators());
122 const box2d& l2_bbox = l2.bbox();
// Column bounds; both are assigned in each of the two cases below.
132 unsigned left_col_ptr;
133 unsigned right_col_ptr;
// First case: the bounds come from l1_bbox, each side moved inwards by
// `quarter` (its initializer is not visible in this excerpt).
141 const unsigned quarter =
146 left_col_ptr = l1_bbox.
pmin().
col() + quarter;
147 right_col_ptr = l1_bbox.
pmax().
col() - quarter;
// Second case: same computation, based on l2_bbox instead.
155 const unsigned quarter =
160 left_col_ptr = l2_bbox.
pmin().
col() + quarter;
161 right_col_ptr = l2_bbox.
pmax().
col() - quarter;
// Computes paragraph-level links into `output` from the `left` and `right`
// line links of `lines`.
//
// `output` starts as left.duplicate(). Then, for every text line l, a
// series of geometric tests compares l with its neighbours; where a test
// succeeds in the visible code, output(right_nbh) is reset to right_nbh,
// i.e. the neighbour is made to link to itself.
// NOTE(review): a self-link presumably marks a paragraph boundary - confirm
// against make::paragraph. Several branch bodies and all braces are missing
// from this excerpt, so the exact nesting of the tests cannot be read here.
183 template <
typename L>
185 void paragraph_links(
const line_links<L>&
left,
186 const line_links<L>&
right,
187 line_links<L>& output,
188 const line_set<L>& lines)
190 output = left.duplicate();
202 for_all_lines(l, lines)
203 if (lines(l).is_textline())
// Neighbours of l: left_nbh and lol_nbh ("left of left") are read from
// `output`, right_nbh from `right`.
206 line_id_t left_nbh = output(l);
207 line_id_t right_nbh =
right(l);
208 line_id_t lol_nbh = output(left_nbh);
// Separator test between right_nbh and l: if right_nbh currently links
// to l, reset it to a self-link.
210 if (lines.components().has_separators() &&
211 between_horizontal_separator(lines, right_nbh, l))
213 if (output(right_nbh) == l)
215 output(right_nbh) = right_nbh;
// Same separator test between l and left_nbh (action not visible here).
219 if (lines.components().has_separators() &&
220 between_horizontal_separator(lines, l, left_nbh))
// x-heights of the current line and of its two neighbours.
228 const float x_height = lines(l).x_height();
229 const float left_x_height = lines(left_nbh).x_height();
230 const float right_x_height = lines(right_nbh).x_height();
// Bounding boxes of the four lines involved.
232 const box2d& left_line_bbox = lines(left_nbh).
bbox();
233 const box2d& current_line_bbox = lines(l).
bbox();
234 const box2d& right_line_bbox = lines(right_nbh).
bbox();
235 const box2d& lol_line_bbox = lines(lol_nbh).
bbox();
// Minimum columns of the boxes.
237 const int lline_col_min = left_line_bbox.
pmin().
col();
238 const int cline_col_min = current_line_bbox.
pmin().
col();
239 const int rline_col_min = right_line_bbox.
pmin().
col();
240 const int lolline_col_min = lol_line_bbox.
pmin().
col();
// Maximum columns of the boxes.
242 const int lline_col_max = left_line_bbox.
pmax().
col();
243 const int cline_col_max = current_line_bbox.
pmax().
col();
244 const int rline_col_max = right_line_bbox.
pmax().
col();
// Character widths, stored as int.
246 const int lline_cw = lines(left_nbh).char_width();
247 const int cline_cw = lines(l).char_width();
248 const int rline_cw = lines(right_nbh).char_width();
// Tolerance of the alignment tests further down: one character width of
// the current line.
250 const int delta_alignment = cline_cw;
255 const int c_baseline = lines(l).baseline();
// Baseline differences: left neighbour minus current line, and current
// line minus right neighbour. A value of 0 selects one of the special
// cases below.
258 const int lc_baseline = lines(left_nbh).baseline() - c_baseline;
259 const int rc_baseline = c_baseline -lines(right_nbh).baseline();
// Largest and smallest of the two baseline differences.
262 const float delta_baseline_max =
std::max(lc_baseline, rc_baseline);
263 const float delta_baseline_min =
std::min(lc_baseline,
267 bool two_lines =
false;
// Case 1: the left neighbour has the same baseline value as l.
270 if (lc_baseline == 0)
// "Right of right" neighbour, read from `right`.
273 const line_id_t ror_nbh =
right(right_nbh);
// Only when ror_nbh differs from right_nbh and links back to it.
277 if (ror_nbh != right_nbh
278 && output(ror_nbh) == right_nbh)
281 const float right_distance = lines(l).meanline() - lines(right_nbh).baseline();
283 const float ror_distance = lines(right_nbh).meanline() - lines(ror_nbh).baseline();
285 const float ror_x_height = lines(ror_nbh).x_height();
// Reset the link when the distance to right_nbh exceeds 1.4 times the
// distance between right_nbh and ror_nbh while their x-heights are
// within a factor 1.2 of each other.
289 if (right_distance > 1.4f * ror_distance
290 &&
std::max(ror_x_height, right_x_height) <
291 1.2f *
std::min(ror_x_height, right_x_height)
292 && output(right_nbh) == l)
294 output(right_nbh) = right_nbh;
302 const float distance = lines(l).meanline() - lines(right_nbh).baseline();
// Reset the link when the distance exceeds twice the smaller x-height.
308 if (distance > 2.0f *
std::min(x_height, right_x_height)
309 && output(right_nbh) == l)
311 output(right_nbh) = right_nbh;
317 const float min_x_height =
std::min(x_height, right_x_height);
318 const float max_x_height =
std::max(x_height, right_x_height);
319 const float min_char_width =
std::min(rline_cw, cline_cw);
320 const float max_char_width =
std::max(rline_cw, cline_cw);
// Reset the link when both the x-heights and the character widths differ
// by more than a factor 1.2.
324 if ((max_x_height > min_x_height * 1.2f) &&
325 !(max_char_width <= 1.2f * min_char_width))
327 if (output(right_nbh) == l)
329 output(right_nbh) = right_nbh;
// Case 2: the right neighbour has the same baseline value as l. The
// visible tests mirror case 1 on the left side; their actions are not
// part of this excerpt.
339 else if (rc_baseline == 0)
344 if (lol_nbh != left_nbh)
347 const float left_distance = lines(left_nbh).meanline() -
351 const float lol_distance = lines(lol_nbh).meanline() -
352 lines(left_nbh).baseline();
354 const float lol_x_height = lines(lol_nbh).x_height();
358 if (left_distance > 1.4f * lol_distance
359 &&
std::max(lol_x_height, left_x_height) <
360 1.2f *
std::min(lol_x_height, left_x_height))
370 const float distance = lines(left_nbh).meanline() -
377 if (distance > 2.0f *
std::min(x_height, left_x_height))
385 const float min_x_height =
std::min(x_height, left_x_height);
386 const float max_x_height =
std::max(x_height, left_x_height);
387 const float min_char_width =
std::min(lline_cw, cline_cw);
388 const float max_char_width =
std::max(lline_cw, cline_cw);
392 if ((max_x_height > min_x_height * 1.2f) &&
393 !(max_char_width <= 1.2f * min_char_width))
// Case 3: the larger baseline difference is at least 1.1 times the
// smaller one.
404 else if (delta_baseline_max >= 1.1f * delta_baseline_min)
407 const float left_distance =
408 lines(left_nbh).meanline() - lines(l).baseline();
410 const float right_distance =
411 lines(l).meanline() - lines(right_nbh).baseline();
// Left side clearly farther than the right side (the rest of the
// condition and its action are not visible here).
415 if ((left_distance > 1.2f * right_distance
416 &&
std::max(x_height, left_x_height) > 1.2f *
418 || (left_distance > 2.0 * right_distance))
// Right side clearly farther than the left side: reset the right link.
425 else if (((right_distance > 1.2f * left_distance
426 &&
std::max(x_height, right_x_height) > 1.2f *
428 || (right_distance > 2.0f * left_distance))
429 && output(right_nbh) == l)
431 output(right_nbh) = right_nbh;
// Remaining case: compare x-heights and character widths with the left
// and with the right neighbour; the order depends on which baseline
// difference is larger.
440 if (lc_baseline > rc_baseline)
442 const float cw_max =
std::max(lline_cw, cline_cw);
443 const float cw_min =
std::min(lline_cw, cline_cw);
444 const float min_x_height =
std::min(x_height, left_x_height);
445 const float max_x_height =
std::max(x_height, left_x_height);
447 if ((max_x_height > min_x_height * 1.2f) &&
448 !(cw_max <= 1.2f * cw_min))
455 const float min_x_height =
std::min(x_height, right_x_height);
456 const float max_x_height =
std::max(x_height, right_x_height);
457 const float cw_max =
std::max(rline_cw, cline_cw);
458 const float cw_min =
std::min(rline_cw, cline_cw);
460 if ((max_x_height > min_x_height * 1.2f)
461 && !(cw_max <= 1.2f * cw_min)
462 && output(right_nbh) == l)
464 output(right_nbh) = right_nbh;
471 const float cw_max =
std::max(rline_cw, cline_cw);
472 const float cw_min =
std::min(rline_cw, cline_cw);
473 const float min_x_height =
std::min(x_height, right_x_height);
474 const float max_x_height =
std::max(x_height, right_x_height);
476 if ((max_x_height > min_x_height * 1.2f)
477 && !(cw_max <= 1.2f * cw_min)
478 && output(right_nbh) == l)
480 output(right_nbh) = right_nbh;
485 const float min_x_height =
std::min(x_height, left_x_height);
486 const float max_x_height =
std::max(x_height, left_x_height);
487 const float cw_max =
std::max(lline_cw, cline_cw);
488 const float cw_min =
std::min(lline_cw, cline_cw);
490 if ((max_x_height > min_x_height * 1.2f)
491 && !(cw_max <= 1.2f * cw_min))
// Minimum-column alignment test: the left neighbour must be aligned with
// both the right neighbour and lol_nbh, within delta_alignment.
523 bool left_right_aligned =
false;
524 bool left_lol_aligned =
false;
525 const int dx_lr =
std::abs(lline_col_min - rline_col_min);
526 const int dx_llol =
std::abs(lline_col_min - lolline_col_min);
528 if (dx_lr < delta_alignment)
529 left_right_aligned =
true;
531 if (dx_llol < delta_alignment)
532 left_lol_aligned =
true;
534 if (left_right_aligned && left_lol_aligned)
536 const int left_right_col_min =
std::min(lline_col_min, rline_col_min);
537 const int dx_lrc =
std::abs(left_right_col_min - cline_col_min);
538 const float l_char_width = 1.5f * lines(l).char_width();
// The current line starts to the right of both neighbours, offset by
// more than 1.5 and less than 4.5 character widths: reset the right link.
540 if (dx_lrc > l_char_width &&
541 dx_lrc < 3.0f * l_char_width &&
542 cline_col_min > rline_col_min &&
543 cline_col_min > lline_col_min)
545 output(right_nbh) = right_nbh;
// Maximum-column alignment test between the left and right neighbours,
// combined with minimum-column tests against the current line.
567 bool left_right_max_aligned =
false;
568 bool left_current_min_aligned =
false;
569 bool lol_current_min_aligned =
false;
// True when left_nbh links to itself in `output`.
570 const bool lol_is_left = output(left_nbh) == left_nbh;
571 const int dx_lr_max =
std::abs(lline_col_max - rline_col_max);
572 const int dx_lc_min =
std::abs(lline_col_min - cline_col_min);
573 const int dx_lolc_min =
std::abs(lolline_col_min - cline_col_min);
575 if (dx_lr_max < delta_alignment)
576 left_right_max_aligned =
true;
578 if (dx_lc_min < delta_alignment)
579 left_current_min_aligned =
true;
581 if (dx_lolc_min < delta_alignment)
582 lol_current_min_aligned =
true;
584 if (!left_current_min_aligned && left_right_max_aligned &&
585 (lol_current_min_aligned || lol_is_left))
587 const int dx_lrc =
std::abs(lline_col_max - cline_col_max);
// Here the threshold is the plain character width (no 1.5 factor).
588 const int l_char_width = lines(l).char_width();
// The action taken when this test succeeds is not visible here.
590 if (dx_lrc > l_char_width &&
591 cline_col_max < lline_col_max &&
592 cline_col_min < lline_col_min &&
593 (lline_col_min > lolline_col_min || lol_is_left))
// Minimum-column alignment test between right_nbh and its own right
// neighbour ror_nbh.
617 const line_id_t ror_nbh =
right(right_nbh);
618 const box2d& ror_line_bbox = lines(ror_nbh).
bbox();
619 const int rorline_col_min = ror_line_bbox.
pmin().
col();
621 bool right_ror_min_aligned =
false;
622 const int dx_rror_min =
std::abs(rline_col_min - rorline_col_min);
624 if (dx_rror_min < delta_alignment)
625 right_ror_min_aligned =
true;
627 if (right_ror_min_aligned)
629 const int right_ror_col_min =
std::min(rline_col_min, rorline_col_min);
630 const int dx_rrorc =
std::abs(right_ror_col_min - cline_col_min);
631 const float l_char_width = 1.5f * lines(l).char_width();
// The current line starts to the right of right_nbh, offset by more than
// 1.5 and less than 4.5 character widths, and does not end before it:
// reset the right link.
633 if (dx_rrorc > l_char_width &&
634 dx_rrorc < 3.0f * l_char_width &&
635 cline_col_min > rline_col_min &&
636 cline_col_max >= rline_col_max)
638 output(right_nbh) = right_nbh;
// Prepares per-line data for the link search over `domain` and `lines`.
// Only text lines are processed. The remaining parameters are not visible
// in this excerpt; the driver passes `blocks` and `rbbox` after `lines`.
690 template <
typename L>
692 void prepare_lines(
const box2d& domain,
693 const line_set<L>& lines,
// Boxes already registered, keyed by an int column value (see the
// find(col) / operator[](col) uses below).
697 std::map< int, std::vector< const box2d* > > drawn_lines;
702 for_all_lines(l, lines)
703 if (lines(l).is_textline())
// Two consecutive values are derived from each line id l: the even value
// 2 * (l + 1) and the odd value that follows it.
710 const unsigned index = l + 1;
711 const unsigned even_index = 2 * index;
712 const unsigned odd_index = even_index + 1;
// First search: candidate columns are taken from the box's maximum column
// plus col_offset.
716 bool not_finished =
true;
724 const int col = b.
pmax().
col() + col_offset;
725 std::map< int, std::vector< const box2d* > >::iterator it
726 = drawn_lines.find(col);
// Boxes are already registered at this column: scan them.
728 if (it != drawn_lines.end())
// NOTE(review): this local `lines` shadows the function parameter of the
// same name inside this scope.
730 const std::vector< const box2d* >& lines = (*it).second;
731 const unsigned nb_lines = lines.size();
734 for (i = 0; i < nb_lines; ++i)
// NOTE(review): presumably a row-overlap test; min_row and max_row are
// computed on lines that are not visible here - confirm.
740 if (min_row - max_row <= 0)
// Column accepted: stop searching and register this line's box there.
748 not_finished =
false;
749 drawn_lines[col].push_back(&(rbbox[l]));
758 not_finished =
false;
759 drawn_lines[col].push_back(&(rbbox[l]));
// Second search: same procedure, with candidate columns taken from the
// box's minimum column minus col_offset.
766 bool not_finished =
true;
774 const int col = b.
pmin().
col() - col_offset;
775 std::map< int, std::vector< const box2d* > >::iterator it
776 = drawn_lines.find(col);
778 if (it != drawn_lines.end())
// NOTE(review): shadows the `lines` parameter here as well.
780 const std::vector< const box2d* >& lines = (*it).second;
781 const unsigned nb_lines = lines.size();
784 for (i = 0; i < nb_lines; ++i)
786 const box2d* box = lines[i];
790 if (min_row - max_row <= 0)
798 not_finished =
false;
799 drawn_lines[col].push_back(&(rbbox[l]));
808 not_finished =
false;
809 drawn_lines[col].push_back(&(rbbox[l]));
// Fills the `left` links of the text lines in `lines` by scanning the
// values stored in `blocks` towards decreasing addresses (--p), starting
// near each line's box in `rbbox`. Return type and the parameters between
// `blocks` and `lines` are not visible in this excerpt.
816 template <
typename L>
819 process_left_link(L& blocks,
821 const line_set<L>& lines,
// First pass over the text lines (body not visible in this excerpt).
827 for_all_lines(l, lines)
828 if (lines(l).is_textline())
// Second pass: look for a left neighbour of each text line.
836 for_all_lines(l, lines)
837 if (lines(l).is_textline())
// 1.5 times the x-height, truncated to int.
840 int dmax = 1.5f * lines(l).x_height();
// c is the box centre; q has the same column and sits a quarter of the
// way from the box's minimum row to the centre row.
843 point2d c = rbbox(l).pcenter();
844 point2d q(rbbox(l).pmin().row() + ((c.
row() - rbbox(l).pmin().row()) / 4), c.
col());
// Half of the box's column extent.
847 midcol = (rbbox(l).pmax().col()
848 - rbbox(l).pmin().col()) / 2;
// Columns between the centre and the minimum column of the blocks domain.
854 nleftima = c.
col() - blocks.domain().pmin().col(),
// Scan length: at most midcol + dmax, and never past the domain border.
856 nleft =
std::min(nleftima, midcol + dmax);
863 *pstop = p - nleft - 1,
// Two pointers are moved backwards in lockstep.
868 for (; p != pstop; --
p, --p2)
// (v >> 1) - 1 turns a stored value v back into a line id; it inverts the
// even value 2 * (l + 1) computed in prepare_lines. The candidate is
// rejected when its own left link already points to l.
872 &&
left((*p2 >> 1) - 1) != l)
881 &&
left((*p >> 1) - 1) != l)
// Collects values met while walking backwards from nbh_p until the
// value end_p is reached.
912 std::vector<V> lines_nbh;
913 const V end_p = *nbh_p + 1;
914 const V* nbh_p_copy = nbh_p;
916 for (; *nbh_p != end_p; --nbh_p)
// Even values are recorded.
920 if ((*nbh_p) % 2 == 0)
922 lines_nbh.push_back(*nbh_p);
// For other values, check whether the value just below (v - 1) was
// recorded earlier.
930 if (
std::find(lines_nbh.begin(), lines_nbh.end(),
931 (*nbh_p) - 1) != lines_nbh.end())
// Link l to the line decoded from v - 1, unless that line's left link
// already points to l.
937 &&
left(((*nbh_p - 1) >> 1) - 1) != l)
938 left(l) = ((*nbh_p - 1) >> 1) - 1;
// Otherwise the link is decoded from the value at the saved start pointer
// (the guarding condition is not visible here).
952 left(l) = (*nbh_p_copy >> 1) - 1;
// Fills the `right` links of the text lines in `lines` by scanning the
// values stored in `blocks` towards increasing addresses (++p), starting
// near each line's box in `rbbox`. It mirrors the left-link search. The
// return type and the parameter between `blocks` and `lines` are not
// visible in this excerpt.
960 template <
typename L>
963 process_right_link(L& blocks,
965 const line_set<L>& lines,
966 line_links<L>& right)
// First pass over the text lines (body not visible in this excerpt).
971 for_all_lines(l, lines)
972 if (lines(l).is_textline())
// Second pass: look for a right neighbour of each text line.
980 for_all_lines(l, lines)
981 if (lines(l).is_textline())
// 1.5 times the x-height, truncated to int.
984 int dmax = 1.5f * lines(l).x_height();
// c is the box centre; q has the same column and sits a quarter of the
// way from the box's maximum row to the centre row.
987 point2d c = rbbox(l).pcenter();
988 point2d q(rbbox(l).pmax().row() - ((rbbox(l).pmax().row() - c.
row()) / 4), c.
col());
// Half of the box's column extent.
991 midcol = (rbbox(l).pmax().col()
992 - rbbox(l).pmin().col()) / 2;
// Columns remaining between the centre and the far side of `blocks`.
997 nrightima =
geom::ncols(blocks) - c.
col() + blocks.domain().pmin().col(),
// Scan length: at most midcol + dmax, and never past that limit.
998 nright =
std::min(nrightima, midcol + dmax);
1005 *pstop = p + nright - 1,
// Two pointers are moved forwards in lockstep.
1010 for (; p != pstop; ++
p, ++p2)
// ((v - 1) >> 1) - 1 turns a stored odd value v back into a line id; it
// inverts the odd value 2 * (l + 1) + 1 computed in prepare_lines. The
// candidate is rejected when its own right link already points to l.
1014 &&
right(((*p2 - 1) >> 1) - 1) != l)
1023 &&
right(((*p - 1) >> 1) - 1) != l)
// Collects values met while walking forwards from nbh_p until the value
// end_p is reached.
1054 std::vector<V> lines_nbh;
1055 const V end_p = *nbh_p - 1;
1056 const V* nbh_p_copy = nbh_p;
1058 for (; *nbh_p != end_p; ++nbh_p)
// Odd values are recorded.
1062 if (*nbh_p % 2 == 1)
1064 lines_nbh.push_back(*nbh_p);
// For other values, check whether the value just above (v + 1) was
// recorded earlier.
1072 if (
std::find(lines_nbh.begin(), lines_nbh.end(),
1073 *nbh_p + 1) != lines_nbh.end())
// Link l to the line decoded from v, unless that line's right link
// already points to l.
1079 &&
right((*nbh_p >> 1) - 1) != l)
1080 right(l) = (*nbh_p >> 1) - 1;
// The walk ended on end_p: decode the link from the value at the saved
// start pointer instead.
1093 if (*nbh_p == end_p)
1094 right(l) = ((*nbh_p_copy - 1) >> 1) - 1;
// Post-processes the `left` and `right` links of every text line in
// `lines`. Both link sets are taken by non-const reference. For each line
// the opposite-direction link of its neighbour is inspected; the updates
// performed inside the two tests are not visible in this excerpt.
1105 template<
typename L >
1107 void finalize_links(line_links<L>& left,
1108 line_links<L>& right,
1109 const line_set<L>& lines)
1113 for_all_lines(l, lines)
1114 if (lines(l).is_textline())
// Current left and right neighbours of l.
1116 const unsigned left_value =
left(l);
1117 const unsigned right_value =
right(l);
// Right link of the left neighbour, taken by reference.
1122 line_id_t& v =
right(left_value);
// The left neighbour's right link points to itself.
1124 if (v == left_value)
// Left link of the right neighbour, taken by reference.
1131 line_id_t& v =
left(right_value);
// The right neighbour's left link points to itself.
1133 if (v == right_value)
// Top-level driver. Its name and parameter list are not visible in this
// excerpt; NOTE(review): presumably extract_paragraphs, going by the
// include guard - confirm. It builds left, right and paragraph links for
// `lines` and turns the paragraph links into a paragraph_set.
1141 template <
typename L>
// Link containers built over `lines`.
1153 line_links<L>
left(lines);
1155 line_links<L>
right(lines);
1157 line_links<L> output(lines);
// One slot per line plus one extra.
1160 rbbox.
resize(lines.nelements() + 1);
// Pipeline: prepare the lines, compute left links, compute right links,
// post-process both link sets, then derive the paragraph links.
1163 internal::prepare_lines(input.
domain(), lines , blocks, rbbox);
1165 internal::process_left_link(blocks, rbbox, lines , left);
1167 internal::process_right_link(blocks, rbbox, lines , right);
1169 internal::finalize_links(left, right, lines );
1171 internal::paragraph_links(left, right, output, lines);
// Paragraph set built from the paragraph links.
1173 paragraph_set<L> par_set = make::paragraph(output);
1177 # endif // ! MLN_INCLUDE_ONLY
1183 #endif // ! SCRIBO_TEXT_EXTRACT_PARAGRAPHS_HH