27 #ifndef SCRIBO_TEXT_MERGING_HH
28 # define SCRIBO_TEXT_MERGING_HH
42 #include <mln/core/image/image2d.hh>
43 #include <mln/core/image/dmorph/image_if.hh>
44 #include <mln/util/array.hh>
45 #include <mln/io/pbm/load.hh>
46 #include <mln/io/pgm/save.hh>
48 #include <mln/data/fill.hh>
49 #include <mln/data/wrap.hh>
51 #include <mln/make/box2d.hh>
53 #include <mln/value/rgb8.hh>
54 #include <mln/io/ppm/save.hh>
56 #include <mln/draw/box.hh>
57 #include <mln/data/stretch.hh>
58 #include <mln/data/wrap.hh>
59 #include <mln/util/timer.hh>
61 #include <scribo/text/look_like_text_lines.hh>
91 # ifndef MLN_INCLUDE_ONLY
// Function template over two type parameters, T and T2. Its name and
// parameter list are not part of this excerpt.
// NOTE(review): presumably the box-taking overload of draw_box() that is
// called further down as draw_box(image, box, value) -- confirm.
101 template <
typename T,
typename T2>
// Pointer to the element of `input` located at the min corner of box `b`.
108 T* p_start = & input(b.
pmin());
// Outer loop: one iteration per row (`nrows` is declared outside this excerpt).
110 for (
unsigned r = 0; r <
nrows; ++r)
// Inner loop: one iteration per column of the current row.
113 for (
unsigned c = 0; c <
ncols; ++c)
// Function template over T and T2 that receives the box as four integer
// coordinates (min/max row and column) instead of a box object. Its name
// is not part of this excerpt.
// NOTE(review): presumably the coordinate-taking overload of draw_box() --
// confirm.
122 template <
typename T,
typename T2>
124 int pmin_row,
int pmin_col,
125 int pmax_row,
int pmax_col,
// Inverted-box test (max corner before min corner). The action taken when
// it holds is outside this excerpt.
128 if (pmax_row < pmin_row || pmax_col < pmin_col)
// Last valid row / column index of `input`.
132 input_nrows_1 = input.
nrows() - 1,
133 input_ncols_1 = input.
ncols() - 1;
// Clamp the four coordinates into [0, nrows-1] x [0, ncols-1].
// NOTE(review): a box lying entirely outside the image (e.g. pmin_row >
// input_nrows_1) passes the inverted-box test above, yet after clamping
// pmax_row < pmin_row, so the row count computed below would be <= 0.
// Confirm that callers never pass such a box.
134 if (pmin_row < 0) pmin_row = 0;
135 if (pmin_col < 0) pmin_col = 0;
136 if (pmax_row > input_nrows_1) pmax_row = input_nrows_1;
137 if (pmax_col > input_ncols_1) pmax_col = input_ncols_1;
// Extent of the clamped box, bounds included.
141 nrows = pmax_row - pmin_row + 1,
142 ncols = pmax_col - pmin_col + 1;
// Pointer to the element of `input` at the clamped min corner.
143 T* p_start = & input.
at_(pmin_row, pmin_col);
// Row-by-row, column-by-column walk over the clamped box.
145 for (
unsigned r = 0; r <
nrows; ++r)
148 for (
unsigned c = 0; c <
ncols; ++c)
// Recursive step of my_find_root(): resolves the root of parent[x], stores
// that result back into parent[x] and returns it. The base case is outside
// this excerpt.
162 return parent[x] = my_find_root(parent, parent[x]);
// swap_ordering(l1, l2): the body is outside this excerpt.
// NOTE(review): both labels are taken by value and the return type is void,
// so nothing done to l1/l2 in the body can reach the caller (see the call
// `swap_ordering(l1, l2);` further down). Confirm whether reference
// parameters were intended.
167 void swap_ordering(
unsigned l1,
unsigned l2)
// Function template over L; name and signature are outside this excerpt.
// NOTE(review): presumably do_union(lines, l1, l2, parent), as called by
// one_merge_pass() -- confirm.
178 template <
typename L>
// Replace both labels by the root of their respective sets.
184 l1 = my_find_root(parent, l1);
185 l2 = my_find_root(parent, l2);
// NOTE(review): swap_ordering() takes its arguments by value, so this call
// cannot change l1/l2 here -- confirm intent.
191 swap_ordering(l1, l2);
// Mutable references to the line information of both roots.
194 line_info<L>& l1_info = lines(l1);
195 line_info<L>& l2_info = lines(l2);
// Case where l2 has the larger card() value.
197 if (l2_info.card() > l1_info.card())
// The two line_info objects are exchanged, then `tmp` (declared outside this
// excerpt) is merged into l1_info.
201 std::swap(l1_info, l2_info);
202 l1_info.fast_merge(tmp);
// l2 is tagged as Merged and hidden explicitly.
207 l2_info.update_tag(line::Merged);
208 l2_info.set_hidden(
true);
// Direct merge of l2 into l1.
// NOTE(review): presumably the `else` branch of the card() test -- the
// branching structure is not visible here.
211 l1_info.fast_merge(l2_info);
// between_separators(lines, l1_, l2_): returns a bool computed from three
// `const bool*` cursors that are advanced until one of them points at a true
// value or `sep_ptr` reaches `end`.
// NOTE(review): presumably the cursors scan a separator image between the two
// lines -- the image they point into is set up outside this excerpt.
231 template <
typename L>
232 bool between_separators(
const line_set<L>& lines,
233 const line_id_t& l1_,
234 const line_id_t& l2_)
// Only meaningful when the component set carries separators.
237 mln_precondition(lines.components().has_separators());
243 const box2d& l2_bbox = l2.bbox();
// Three read-only cursors plus the end sentinel for `sep_ptr`.
253 typedef const bool* sep_ptr_t;
254 sep_ptr_t sep_ptr, sep_ptr_top, sep_ptr_bottom, end;
// First configuration: the scan ends col2 - col1 elements after sep_ptr.
258 const unsigned quarter =
266 end = sep_ptr + col2 - col1;
// Second configuration: the scan ends col1 - col2 elements after sep_ptr.
270 const unsigned quarter =
278 end = sep_ptr + col1 - col2;
// Advance while none of the three cursors points at a true value.
// NOTE(review): the three dereferences are evaluated before the
// `sep_ptr != end` test, so the element at `end` is read as well (here and
// in the return statement). Confirm that `end` is always dereferenceable.
282 while (!*sep_ptr && !*sep_ptr_top && !*sep_ptr_bottom && sep_ptr != end)
// True when the scan stopped on a true value under any cursor.
289 return (*sep_ptr || *sep_ptr_top || *sep_ptr_bottom);
// Function template over L taking two line ids; its name and first parameter
// are outside this excerpt.
// NOTE(review): presumably lines_can_merge(lines, l1_, l2_), as called by
// one_merge_pass() -- confirm.
301 template <
typename L>
303 const line_id_t& l1_,
304 const line_id_t& l2_)
// Upper bound compared against `x_ratio` further down.
310 const float x_ratio_max = 1.7f;
// Baseline tolerance: half of the smaller of the two x-heights.
311 const float baseline_delta_max =
312 0.5f *
std::min(l1.x_height(), l2.x_height());
314 const box2d& l1_bbox = l1.bbox();
315 const box2d& l2_bbox = l2.bbox();
317 const point2d& l1_pmin = l1_bbox.pmin();
319 const point2d& l1_pmax = l1_bbox.pmax();
// Despite its name, this flag is a property of the whole component set.
322 const bool l1_has_separators = lines.
components().has_separators();
// between_separators() is only evaluated when separators exist (it has a
// precondition on that); otherwise the flag is false.
323 const bool l1_l2_between_separators = (l1_has_separators) ?
324 between_separators(lines, l1_, l2_) :
false;
325 const float l_ted_cw = l2.char_width();
// Tail of a condition: l1 shorter than l2's x-height, dx below l2's
// char_width() with negative dy, and no separator in between. When it holds
// l1 is retyped as Punctuation. `dx`/`dy` are defined outside this excerpt.
337 && l1_bbox.height() < l2.x_height())
340 && (dx < l_ted_cw && dy < 0)
342 && not (l1_l2_between_separators))
345 l1.update_type(line::Punctuation);
// Rows used by the next condition: top rows of both boxes, bottom row of l2.
353 top_row_l2 = l2_pmin.
row(),
354 top_row_l1 = l1_pmin.
row(),
355 bot_row = l2_pmax.
row();
356 const float x1 = l1.x_height(), x2 = l2.x_height();
// Condition terms: no separator in between, l2's bottom row within
// baseline_delta_max of l1's baseline, top rows less than 5 apart, and
// dx below 5 times l2's char_width().
360 !l1_l2_between_separators
364 && (
std::abs(bot_row - l1.baseline()) < baseline_delta_max)
366 && (
std::abs(top_row_l2 - top_row_l1) < 5)
368 && dx < 5.0f * l_ted_cw)
376 const float x1 = l1.x_height(), x2 = l2.x_height();
// Test on the x-height ratio (`x_ratio` is computed outside this excerpt;
// the action taken is not visible).
378 if (x_ratio > x_ratio_max)
// Test on the distance between the two baselines (action not visible).
384 if (
std::abs(l1.baseline() - l2.baseline()) > baseline_delta_max)
390 col1 = l1_bbox.pcenter().col(),
// Compare the box centers, each shifted by a quarter of its box width, in
// both directions (`col2` is defined outside this excerpt).
394 if ((col1 + l1_bbox.width() / 4) >= (col2 - l2_bbox.width() / 4))
398 if ((col2 + l2_bbox.width() / 4) >= (col1 - l1_bbox.width() / 4))
// With separators available, the result is the negation of the
// "between separators" flag.
402 if (l1_has_separators)
403 return ! l1_l2_between_separators;
// horizontal_distance(): returns an int; the first parameter is a box. The
// remaining parameters and the body are outside this excerpt.
412 int horizontal_distance(
const box2d& l1,
// non_text_and_text_can_merge(lines, l_cur_, l_ted_): returns a bool built
// from tests on the two lines' types, boxes and metrics. `lines` is taken by
// non-const reference because l_cur may be retyped as Punctuation below.
// Most return statements are outside this excerpt.
439 template <
typename L>
440 bool non_text_and_text_can_merge(line_set<L>& lines,
441 const line_id_t& l_cur_,
442 const line_id_t& l_ted_)
// Guard: l_cur is already Text, or l_ted is not Text (action not visible).
448 if (l_cur.type() == line::Text || l_ted.type() != line::Text)
// Guard: separators exist and between_separators() holds for the two lines
// (action not visible). has_separators() is tested first because
// between_separators() has a precondition on it.
455 if (lines.components().has_separators()
456 && between_separators(lines, l_cur_, l_ted_))
459 const box2d& l_cur_bbox = l_cur.bbox();
460 const box2d& l_ted_bbox = l_ted.bbox();
469 const float l_ted_cw = l_ted.char_width();
470 const float l_ted_x_height = l_ted.x_height();
472 const unsigned l_cur_height = l_cur_bbox.height();
473 const unsigned l_cur_width = l_cur_bbox.width();
// First case: l_cur's height is below l_ted's x-height but above 5% of it,
// its width divided by card() is below l_ted's char_width(), and its top row
// is above l_ted's bottom row. l_cur is then retyped as Punctuation.
476 if (l_cur_height < l_ted_x_height
477 && l_cur_height > 0.05f * l_ted_x_height
478 &&
float(l_cur_width) /
float(l_cur.card()) < l_ted.char_width()
480 && l_cur_pmin.
row() < l_ted_pmax.
row())
482 l_cur.update_type(line::Punctuation);
// Second case: l_cur is shorter than l_ted's x-height, its width lies
// between 0.8 and 5 times l_ted's char_width(), and its center row is within
// 7 rows of the midpoint between l_ted's baseline and meanline.
488 l_cur_height < l_ted_x_height
490 && l_cur_width > 0.8 * l_ted_cw
491 && l_cur_width < 5 * l_ted_cw
493 &&
std::abs((l_ted.baseline() + l_ted.meanline()) / 2 - l_cur.bbox().
pcenter().
row()) < 7
// NOTE(review): horizontal_distance() returns int; the cast to unsigned
// would turn a negative distance into a very large value and fail this
// test. Confirm whether negative distances can occur.
495 &&
unsigned(horizontal_distance(l_cur_bbox, l_ted_bbox)) < 2 * l_ted_cw
// Top and bottom rows of l_cur's bounding box.
506 top_row = l_cur.bbox().
pmin().
row(),
507 bot_row = l_cur.bbox().
pmax().
row();
// Third case: l_cur's bottom row within 5 of l_ted's baseline, a term on its
// top row within 5 of l_ted's meanline, and dx below 5 times l_ted's
// char_width(). How the terms are combined is not fully visible here.
511 if ((
std::abs(bot_row - l_ted.baseline()) < 5
514 (
std::abs(top_row - l_ted.meanline()) < 5
516 && dx < 5.0f * l_ted_cw)
// one_merge_pass(ith_pass, ..., v, ...): walks the line ids stored in `v`
// from last to first. For each line it reads labels from `billboard` at
// corners of the line's ebbox() and, depending on how many distinct labels
// were collected and on the line types, merges lines through do_union() and
// draws boxes into `billboard` and `log`.
// The return type, several parameters, the declarations of `billboard`,
// `log`, `l`, `l_`, `mc`, `dx`, `dy`, and most of the branching structure are
// outside this excerpt; the comments below describe only the visible lines.
614 template <
typename L>
616 one_merge_pass(
unsigned ith_pass,
618 std::vector<scribo::line_id_t>& v,
628 const unsigned n = v.size();
// Per-case counters, all starting at zero.
632 count_txtline_IN_txtline = 0,
633 count_txtline_IN_junk = 0,
634 count_two_lines_merge = 0,
635 count_new_txtline = 0,
636 count_comp_IN_txtline = 0,
637 count_comp_HITS_txtline = 0,
// Reverse traversal of `v`.
// NOTE(review): `n` is unsigned, so for an empty `v` the expression n - 1
// wraps before being converted to int; the loop relies on that conversion
// yielding -1 -- confirm this is acceptable on the supported platforms.
640 for (
int i = n - 1; i >= 0; --i)
// b: bounding box of the current line; b_: its ebbox().
647 const box2d& b = lines(l).bbox();
649 const box2d& b_ = lines(l).ebbox();
// Labels read from `billboard` at three corners of b_: (pmin), 
// (pmax row, pmin col) and (pmax).
665 const unsigned tl = billboard(b_.
pmin());
670 const unsigned bl = billboard.at_(b_.
pmax().
row(), b_.
pmin().
col());
671 const unsigned br = billboard(b_.
pmax());
// Set of distinct labels collected for the current line.
673 typedef std::set<unsigned> set_t;
674 std::set<unsigned> labels;
// Sanity check over the collected labels: each one is expected to denote a
// Text line; otherwise a diagnostic is written to std::cerr.
685 for (set_t::const_iterator it = labels.begin();
692 if (lines(*it).type() != line::Text)
693 std::cerr <<
"outch: we have hit, so drawn, a non-text..." << std::endl;
// Case: exactly one label was collected.
697 if (labels.size() == 1)
703 const line_info<L>& l_info = lines(l);
704 const line_info<L>& mc_info = lines(mc);
// Sub-case: the current line is Text ...
706 if (l_info.type() == line::Text)
// ... and so is the line designated by `mc`.
708 if (mc_info.type() == line::Text)
710 ++count_txtline_IN_txtline;
732 const float l_ted_cw = mc_info.char_width();
// Merge test: card() at most 5, or baselines and meanlines both within 5 of
// each other; combined with dx below mc's char_width(), negative dy, and a
// term involving between_separators() whose beginning is not visible.
738 if ((l_info.card() <= 5 ||
739 (
std::abs(l_info.baseline() - mc_info.baseline()) < 5
740 &&
std::abs(l_info.meanline() - mc_info.meanline()) < 5))
741 && dx < l_ted_cw && dy < 0
743 && between_separators(lines, l, mc)))
744 l = do_union(lines, l, mc, parent);
761 draw_box(log, b, 126);
// Unexpected situation reported on std::cerr and counted.
767 std::cerr <<
"error: should NOT happen (a text line included in a NON-text-line (so not drawn!!!)" << std::endl;
768 ++count_txtline_IN_junk;
771 draw_box(billboard, lines(l).ebbox(), l);
773 draw_box(log, b, 100);
// Sub-case: the line designated by `mc` is Text.
779 if (lines(mc).type() == line::Text)
781 ++count_comp_IN_txtline;
// Test on whether the non-text line may join the text line (action taken
// when it may not is outside this excerpt).
793 if (!non_text_and_text_can_merge(lines, l, mc))
// The current line is retyped as Punctuation and united with `mc`; note the
// argument order (mc first) in this do_union() call.
804 lines(l).update_type(line::Punctuation);
807 l_ = do_union(lines, mc, l, parent);
810 draw_box(billboard, lines(l_).ebbox(), l_);
813 draw_box(log, b, 128);
// A Text line is drawn into `billboard` under its own id.
822 if (lines(l).type() == line::Text)
825 draw_box(billboard, lines(l).ebbox(), l);
827 draw_box(log, b, 127);
// Second traversal of the collected labels; each one is a merge candidate.
838 for (set_t::const_iterator it = labels.begin();
842 unsigned lcand = *it;
// Candidate handling when the line designated by `l_` is Text.
847 if (lines(l_).type() == line::Text)
850 if (lines_can_merge(lines, l_, lcand))
852 ++count_two_lines_merge;
853 l_ = do_union(lines, l_, lcand, parent);
855 draw_box(billboard, lines(l_).ebbox(), l_);
857 draw_box(log, b, 151);
864 draw_box(log, b, 255);
867 draw_box(billboard, lines(l_).ebbox(), l_);
// NOTE(review): count_comp_HITS_txtline is incremented here and again two
// visible lines below, after non_text_and_text_can_merge() is tested.
// Confirm whether the double increment on that path is intended.
873 ++count_comp_HITS_txtline;
874 if (non_text_and_text_can_merge(lines, l_, lcand))
877 ++count_comp_HITS_txtline;
878 l_ = do_union(lines, l_, lcand, parent);
879 draw_box(billboard, lines(l_).ebbox(), l_);
882 draw_box(log, b, 169);
888 draw_box(log, b, 254);
// order_lines_id<L>: struct used as the comparison object passed to
// std::sort() in draw_boxes(). The visible part of the comparison orders two
// line ids by the nsites() of their bounding boxes, smaller first. Members,
// constructor and the tie-breaking rule are outside this excerpt.
952 template <
typename L>
953 struct order_lines_id
962 const unsigned l1_nsites = lines_(l1).bbox().nsites();
963 const unsigned l2_nsites = lines_(l2).bbox().nsites();
// Tie on nsites(): the value returned in this case is not visible here.
965 if (l1_nsites == l2_nsites)
967 return l1_nsites < l2_nsites;
// draw_boxes(input_domain, ...): driver for the merging process. Visible
// steps: build and sort a vector of line ids, retype lines that look like
// text, run two merge passes with a re-sort in between, then force a stats
// update on `lines`. Return type, remaining parameters and the declaration of
// `parent` are outside this excerpt.
974 template <
typename L>
976 draw_boxes(
const box2d& input_domain,
982 order_lines_id<L> func(lines);
983 std::vector<scribo::line_id_t> v;
// Loop over ids 1 .. parent.nelements() - 1 (id 0 is skipped); the loop body
// is outside this excerpt.
991 for (
unsigned l = 1; l < parent.
nelements(); ++l)
998 std::sort(v.begin(), v.end(), func);
// Every line accepted by looks_like_a_text_line() is retyped as Text.
1002 for_all_lines(l, lines)
1003 if (looks_like_a_text_line(lines(l)))
1004 lines(l).update_type(
line::Text);
// First pass.
1008 one_merge_pass(1, input_domain, v, lines, parent);
// Re-sort before the second pass.
1015 std::sort(v.begin(), v.end(), func);
// Second pass.
1019 one_merge_pass(2, input_domain, v, lines, parent);
1022 lines.force_stats_update();
// merging(lines): takes a line set by const reference and forwards to
// internal::draw_boxes(), passing the domain of the labeled image of the
// line set's components as first argument. The return type, the remaining
// arguments and the returned value are outside this excerpt.
1033 template <typename L>
1035 merging(const scribo::line_set<L>& lines)
1037 using namespace mln;
1040 = internal::draw_boxes(lines.components().labeled_image().domain(),
1045 # endif // ! MLN_INCLUDE_ONLY
1051 #endif // ! SCRIBO_TEXT_MERGING_HH