$extrastylesheet
Olena  User documentation 2.1
An Image Processing Platform
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
extract_paragraphs_hdoc.hh
1 // Copyright (C) 2010, 2011, 2012, 2013 EPITA Research and Development
2 // Laboratory (LRDE)
3 //
4 // This file is part of Olena.
5 //
6 // Olena is free software: you can redistribute it and/or modify it under
7 // the terms of the GNU General Public License as published by the Free
8 // Software Foundation, version 2 of the License.
9 //
10 // Olena is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 // General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License
16 // along with Olena. If not, see <http://www.gnu.org/licenses/>.
17 //
18 // As a special exception, you may use this file as part of a free
19 // software project without restriction. Specifically, if other files
20 // instantiate templates or use macros or inline functions from this
21 // file, or you compile this file and link it with other files to produce
22 // an executable, this file does not by itself cause the resulting
23 // executable to be covered by the GNU General Public License. This
24 // exception does not however invalidate any other reasons why the
25 // executable file might be covered by the GNU General Public License.
26 
30 
31 
32 #ifndef SCRIBO_TEXT_EXTRACT_PARAGRAPHS_HDOC_HH
33 # define SCRIBO_TEXT_EXTRACT_PARAGRAPHS_HDOC_HH
34 
35 #include <mln/util/array.hh>
36 #include <mln/accu/shape/bbox.hh>
37 #include <mln/core/image/image2d.hh>
38 #include <mln/core/alias/neighb2d.hh>
39 #include <mln/draw/box.hh>
40 #include <mln/data/convert.hh>
41 #include <mln/value/int_u16.hh>
42 #include <mln/value/label_16.hh>
43 #include <mln/value/int_u8.hh>
44 #include <mln/value/rgb8.hh>
45 #include <mln/io/ppm/save.hh>
46 #include <mln/io/pgm/save.hh>
47 #include <mln/geom/rotate.hh>
48 #include <mln/literal/colors.hh>
49 
50 #include <scribo/core/def/lbl_type.hh>
51 #include <scribo/core/macros.hh>
52 #include <scribo/core/line_set.hh>
53 #include <scribo/core/line_links.hh>
54 #include <scribo/core/line_info.hh>
55 #include <scribo/core/paragraph_set.hh>
56 
57 using namespace mln;
58 
59 
60 namespace scribo
61 {
62 
63  namespace text
64  {
65 
// Declaration of the paragraph extraction entry point; its definition is
// not part of this excerpt.
//
// \param lines  Set of lines, taken by non-const reference.
//               NOTE(review): presumably updated by the definition --
//               confirm against the implementation.
// \param input  Binary image (image2d<bool>) handed in with the lines.
// \return       A paragraph_set<L>.
70  template <typename L>
71  paragraph_set<L>
72  extract_paragraphs_hdoc(line_set<L>& lines,
73  const image2d<bool>& input);
74 
75 
76 # ifndef MLN_INCLUDE_ONLY
77 
78  namespace internal
79  {
80 
// Tells whether a separator pixel is met between two lines.
//
// The line whose bounding-box center has the smaller row is used as the
// starting line (l1_ when its center row is strictly smaller, l2_
// otherwise). Three columns of that line are probed, row after row,
// from the row following its center row up to (and including) the first
// row (pmin) of the other line.
//
// \param lines  Line set; its component set must provide separators.
// \param l1_    Id of the first line.
// \param l2_    Id of the second line.
// \return       true as soon as one probe hits a set pixel in the
//               separator image, false otherwise.
81  template <typename L>
82  inline
83  bool
84  between_horizontal_separator(const line_set<L>& lines,
85  const line_id_t& l1_,
86  const line_id_t& l2_)
87  {
88  const line_info<L>& l1 = lines(l1_);
89  const line_info<L>& l2 = lines(l2_);
90 
91  // The component set must provide a separator image.
92  mln_precondition(lines.components().has_separators());
93 
94  const box2d& l1_bbox = l1.bbox();
95  const box2d& l2_bbox = l2.bbox();
96 
     // Center rows; they only decide which line is the starting one.
97  unsigned
98  row1 = l1_bbox.pcenter().row(),
99  row2 = l2_bbox.pcenter().row();
100  const mln_ch_value(L, bool)&
101  separators = lines.components().separators();
102 
     // Current row, the three probed columns and the last probed row.
103  unsigned row;
104  unsigned col_ptr;
105  unsigned left_col_ptr;
106  unsigned right_col_ptr;
107  unsigned end;
108 
109  if (row1 < row2)
110  {
     // row1 / row2 are re-assigned here but only row2 is read again
     // (as `end`).
111  row1 = l1_bbox.pmax().row();
112  row2 = l2_bbox.pmin().row();
113 
     // Despite its name this is a quarter of the *half* width
     // (center column - min column, shifted right by 2).
114  const unsigned quarter =
115  ((l1_bbox.pcenter().col() - l1_bbox.pmin().col()) >> 2);
116 
117  row = l1_bbox.pcenter().row();
118  col_ptr = l1_bbox.pcenter().col();
119  left_col_ptr = l1_bbox.pmin().col() + quarter;
120  right_col_ptr = l1_bbox.pmax().col() - quarter;
121  end = row2;
122  }
123  else
124  {
     // Symmetric case: start from l2 and stop at the first row of l1.
125  row2 = l2_bbox.pmax().row();
126  row1 = l1_bbox.pmin().row();
127 
128  const unsigned quarter =
129  ((l2_bbox.pcenter().col() - l2_bbox.pmin().col()) >> 2);
130 
131  row = l2_bbox.pcenter().row();
132  col_ptr = l2_bbox.pcenter().col();
133  left_col_ptr = l2_bbox.pmin().col() + quarter;
134  right_col_ptr = l2_bbox.pmax().col() - quarter;
135  end = row1;
136  }
137 
138  // Move one row at a time; a set pixel under any probe is a separator.
139  while (row < end)
140  {
141  ++row;
142  if (separators.at_(row, col_ptr)
143  || separators.at_(row, left_col_ptr)
144  || separators.at_(row, right_col_ptr))
145  return true;
146  }
147 
148  return false;
149  }
150 
151 
// Tells whether a line other than `index` itself is linked to `index`
// in `right` and could stand on the left of the current line.
//
// A candidate i qualifies when right(i) == index, its bounding box
// starts at a strictly smaller column than the current line, and its
// baseline differs from the current line's baseline by less than half
// of the smaller of the two x-heights.
//
// \param right         Right links.
// \param index         Target line id of the searched links.
// \param current_line  Id of the line the candidates are compared to.
// \param lines         Line set providing bounding boxes and features.
// \return              true if at least one candidate qualifies.
152  template <typename L>
153  bool may_have_another_left_link(const line_links<L>& right,
154  const line_id_t& index,
155  const line_id_t& current_line,
156  const line_set<L>& lines)
157  {
158  const line_info<L>& l = lines(current_line);
159  const point2d& pmin = l.bbox().pmin();
160  const unsigned x1 = l.x_height();
161 
162  for_all_links(i, right)
163  if (i != index && right(i) == index)
164  {
165  const line_info<L>& l_info = lines(i);
166  const unsigned x2 = l_info.x_height();
167 
     // Maximal baseline gap: half of the smaller x-height.
168  const float delta_max = 0.5f * std::min(x1, x2);
169 
170  if (l_info.bbox().pmin().col() < pmin.col()
171  && std::abs(l.baseline() - l_info.baseline()) < delta_max
172  )
173  return true;
174  }
175 
176  return false;
177  }
178 
179 //---------------------------------------------------------------------
180 // This method aims to cut the links between lines that do not fit the
181 // different criteria
182 //---------------------------------------------------------------------
183 
// Computes the paragraph links in `output`, starting from a copy of
// `left`.
//
// For every text line l, the line, its left neighbor (output(l)), its
// right neighbor (right(l)) and the left neighbor of its left neighbor
// are compared. Whenever one of the tests below fires, a link is cut by
// making an entry point to itself (output(x) = x or right(x) = x) and,
// for all tests but the "strange case", the next line is processed.
//
// \param left    Initial left links; only read (duplicated into output).
// \param right   Right links; read, and also modified by the indentation
//                tests (right(l) = l, right(right_nbh) = right_nbh).
// \param output  Resulting links; overwritten.
// \param lines   Line set providing geometry and text features.
184  template <typename L>
185  inline
186  void paragraph_links(const line_links<L>& left,
187  line_links<L>& right,
188  line_links<L>& output,
189  const line_set<L>& lines)
190  {
191  output = left.duplicate();
192 
193  // const unsigned nlines = lines.nelements();
194 
195  // For each line
196  for_all_lines(l, lines)
197  if (lines(l).is_textline())
198  {
199  // Neighbors
200 
201  line_id_t left_nbh = output(l);
202  line_id_t right_nbh = right(l);
203  line_id_t lol_nbh = output(left_nbh);
204 
205  // const line_info<L>& left_line = lines(left_nbh);
206  // const line_info<L>& current_line = lines(l);
207  // const line_info<L>& right_line = lines(right_nbh);
208 
     // A separator between the right neighbor and the current line
     // cuts the link and makes the current line its own right neighbor
     // for the rest of this iteration.
209  if (lines.components().has_separators() &&
210  between_horizontal_separator(lines, right_nbh, l))
211  {
212  output(right_nbh) = right_nbh;
213  right_nbh = l;
214  }
     // Same thing between the current line and its left neighbor.
215  if (lines.components().has_separators() &&
216  between_horizontal_separator(lines, l, left_nbh))
217  {
218  output(l) = l;
219  left_nbh = l;
220  lol_nbh = l;
221  }
222 
223  // Line features
224  const float x_height = lines(l).x_height();
225  const float left_x_height = lines(left_nbh).x_height();
226  const float right_x_height = lines(right_nbh).x_height();
227 
228  const box2d& left_line_bbox = lines(left_nbh).bbox();
229  const box2d& current_line_bbox = lines(l).bbox();
230  const box2d& right_line_bbox = lines(right_nbh).bbox();
231  const box2d& lol_line_bbox = lines(lol_nbh).bbox(); // lol : left neighbor of the left neighbor
232 
233  const int lline_col_min = left_line_bbox.pmin().col();
234  const int cline_col_min = current_line_bbox.pmin().col();
235  const int rline_col_min = right_line_bbox.pmin().col();
236  const int lolline_col_min = lol_line_bbox.pmin().col();
237 
238  const int lline_col_max = left_line_bbox.pmax().col();
239  const int cline_col_max = current_line_bbox.pmax().col();
240  const int rline_col_max = right_line_bbox.pmax().col();
241 
242  const int lline_cw = lines(left_nbh).char_width();
243  const int cline_cw = lines(l).char_width();
244  const int rline_cw = lines(right_nbh).char_width();
245  // Maximal x variation to consider two lines vertically aligned
246  const int delta_alignment = cline_cw;
247 
248 
249  // Checks the baseline distances of the two neighbors
250  {
251  // Current line baseline
252  const int c_baseline = lines(l).baseline();
253 
254  // Baseline distance with the left and right neighbors
255  const int lc_baseline = lines(left_nbh).baseline() - c_baseline;
256  const int rc_baseline = c_baseline -lines(right_nbh).baseline();
257 
258  // Max baseline distance between the two neighbors
259  // const float delta_baseline_max = std::max(lc_baseline, rc_baseline);
260  // const float delta_baseline_min = std::min(lc_baseline,
261  // rc_baseline);
262 
263  // Only two lines, meaning the current line has only one neighbor
264  bool two_lines = false;
265 
266  // If the current line has no left neighbor
     // (a null baseline distance is what happens when left_nbh == l)
267  if (lc_baseline == 0)
268  {
269  // ror : right neighbor of the right neighbor
270  const line_id_t ror_nbh = right(right_nbh);
271  //const box2d& ror_line_bbox = lines(ror_nbh).bbox();
272 
273  // If the current line has a ror
274  if (ror_nbh != right_nbh
275  && output(ror_nbh) == right_nbh)
276  {
277  // Distance between the current line and the right neighbor
278  const float right_distance = lines(l).meanline() - lines(right_nbh).baseline();
279  // Distance between the right neighbor and the ror
280  const float ror_distance = lines(right_nbh).meanline() - lines(ror_nbh).baseline();
281  // ror x_height
282  const float ror_x_height = lines(ror_nbh).x_height();
283 
284  // Conditions to cut the link between the current line
285  // and its right neighbor
286  if (right_distance > 1.4f * ror_distance
287  && std::max(ror_x_height, right_x_height) <
288  1.4f * std::min(ror_x_height, right_x_height)
289  && output(right_nbh) == l)
290  {
291  output(right_nbh) = right_nbh;
292  continue;
293  }
294  }
295  // Otherwise we only have a group of two lines
296  else
297  {
298  // We determine the distance between the two lines
299  const float distance = lines(l).meanline() - lines(right_nbh).baseline();
300  two_lines = true;
301 
302  // If the distance between the two lines is greater than
303  // twice the minimum x height of the two lines then we cut
304  // the link between them
305  if (distance > 2.0f * std::min(x_height, right_x_height)
306  && output(right_nbh) == l)
307  {
308  output(right_nbh) = right_nbh;
309  continue;
310  }
311  }
312 
313  // Lines features
314  const float min_x_height = std::min(x_height, right_x_height);
315  const float max_x_height = std::max(x_height, right_x_height);
316  const float min_char_width = std::min(rline_cw, cline_cw);
317  const float max_char_width = std::max(rline_cw, cline_cw);
318 
319  // Condition to cut the link between the current line and
320  // its right neighbor
321  if ((max_x_height > min_x_height * 1.4f) &&
322  !(max_char_width <= 1.2f * min_char_width))
323  {
324  if (output(right_nbh) == l)
325  {
326  output(right_nbh) = right_nbh;
327  continue;
328  }
329  }
330 
331  // If we only have two lines we stop the study
332  if (two_lines)
333  continue;
334  }
335  // If the current line has no right neighbor
     // (a null baseline distance is what happens when right_nbh == l)
336  else if (rc_baseline == 0)
337  {
338  // lol : left neighbor of the left neighbor
339 
340  // If the left neighbor of the current line has a left neighbor
341  if (lol_nbh != left_nbh)
342  {
343  // Distance between the current line and its left neighbor
344  const float left_distance = lines(left_nbh).meanline() -
345  lines(l).baseline();
346  // Distance between the left neighbor and the left
347  // neighbor of its left neighbor
348  const float lol_distance = lines(lol_nbh).meanline() -
349  lines(left_nbh).baseline();
350  // lol x height
351  const float lol_x_height = lines(lol_nbh).x_height();
352 
353  // Conditions to cut the link between the current line
354  // and its left neighbor
355  if (left_distance > 1.4f * lol_distance
356  && std::max(lol_x_height, left_x_height) <
357  1.4f * std::min(lol_x_height, left_x_height))
358  {
359  output(l) = l;
360  continue;
361  }
362  }
363  // Otherwise we only have a group of two lines
364  else
365  {
366  // Distance between the current line and its left neighbor
367  const float distance = lines(left_nbh).meanline() -
368  lines(l).baseline();
369 
370  two_lines = true;
371 
372  // If the distance is greater than twice the min x height
373  // between the two lines
374  if (distance > 2.0f * std::min(x_height, left_x_height))
375  {
376  output(l) = l;
377  continue;
378  }
379  }
380 
381  // Lines features
382  const float min_x_height = std::min(x_height, left_x_height);
383  const float max_x_height = std::max(x_height, left_x_height);
384  const float min_char_width = std::min(lline_cw, cline_cw);
385  const float max_char_width = std::max(lline_cw, cline_cw);
386 
387  // Condition to cut the link between the current line and
388  // its left neighbor
389  if ((max_x_height > min_x_height * 1.4f) &&
390  !(max_char_width <= 1.2f * min_char_width))
391  {
392  output(l) = l;
393  continue;
394  }
395 
396  // If we only have two lines we stop the study
397  if (two_lines)
398  continue;
399  }
400  // The current line has at least one left and one right neighbor
401  else // if (delta_baseline_max >= 1.1 * delta_baseline_min)
402  {
403  // Distance between the left and the current line
404  const float
405  left_distance = left_line_bbox.pcenter().row() - current_line_bbox.pcenter().row();
406  // Distance between the right and the current line
407  const float
408  right_distance = current_line_bbox.pcenter().row() - right_line_bbox.pcenter().row();;
409 
410  // If the left line is too far compared to the right one
411  // we cut the link with it
412  if (left_distance > 1.5f * right_distance
413  && std::max(x_height, left_x_height) > 1.2f * std::min(x_height, left_x_height))
414  {
415  output(l) = l;
416  continue;
417  }
418  // If the right line is too far compared to the left one
419  // we cut the link with it
420  else if (right_distance > 1.5f * left_distance
421  && std::max(x_height, right_x_height) >= 1.2f * std::min(x_height, right_x_height)
422  && output(right_nbh) == l)
423  {
424  output(right_nbh) = right_nbh;
425  continue;
426  }
427 
428  // If the distance between the baseline of the left
429  // neighbor and the baseline of the current line is
430  // greater than the one between the current line baseline
431  // and the right line baseline we have to study the text
432  // features of the left line first, then of the right line
433  if (lc_baseline > rc_baseline)
434  {
435  const float cw_max = std::max(lline_cw, cline_cw);
436  const float cw_min = std::min(lline_cw, cline_cw);
437  const float min_x_height = std::min(x_height, left_x_height);
438  const float max_x_height = std::max(x_height, left_x_height);
439 
440  if ((max_x_height > min_x_height * 1.4f) &&
441  !(cw_max <= 1.2f * cw_min))
442  {
443  output(l) = l;
444  continue;
445  }
446 
447  {
448  const float min_x_height = std::min(x_height, right_x_height);
449  const float max_x_height = std::max(x_height, right_x_height);
450  const float cw_max = std::max(rline_cw, cline_cw);
451  const float cw_min = std::min(rline_cw, cline_cw);
452 
453  if ((max_x_height > min_x_height * 1.4f)
454  && !(cw_max <= 1.2f * cw_min)
455  && output(right_nbh) == l)
456  {
457  output(right_nbh) = right_nbh;
458  continue;
459  }
460  }
461  }
     // Otherwise the same two tests run in the opposite order:
     // right neighbor first, then left neighbor.
462  else
463  {
464  const float cw_max = std::max(rline_cw, cline_cw);
465  const float cw_min = std::min(rline_cw, cline_cw);
466  const float min_x_height = std::min(x_height, right_x_height);
467  const float max_x_height = std::max(x_height, right_x_height);
468 
469  if ((max_x_height > min_x_height * 1.4f)
470  && !(cw_max <= 1.2f * cw_min)
471  && output(right_nbh) == l)
472  {
473  output(right_nbh) = right_nbh;
474  continue;
475  }
476 
477  {
478  const float min_x_height = std::min(x_height, left_x_height);
479  const float max_x_height = std::max(x_height, left_x_height);
480  const float cw_max = std::max(lline_cw, cline_cw);
481  const float cw_min = std::min(lline_cw, cline_cw);
482 
483  if ((max_x_height > min_x_height * 1.4f)
484  && !(cw_max <= 1.2f * cw_min))
485  {
486  output(l) = l;
487  continue;
488  }
489  }
490  }
491  }
492  }
493 
494  // If we arrive here, it means that the lines in the
495  // neighborhood of the current line are quite similar. We can
496  // then begin to study the indentations in order to determine
497  // the beginning of new paragraphs
498 
499 //-----------------------------------------------------------------------------
500 // ___________________________
501 // |___________________________|
502 // ________________________
503 // |________________________|
504 // ___________________________
505 // |___________________________|
506 // ___________________________
507 // |___________________________|
508 //
509 // Simple case : paragraphs are justified on the left. We try to find any
510 // indentation like above.
511 //
512 //-----------------------------------------------------------------------------
513 
514  {
515  // Check if the current line neighbors are aligned
516  bool left_right_aligned = false;
517  bool left_lol_aligned = false;
518  const int dx_lr = std::abs(lline_col_min - rline_col_min);
519  const int dx_llol = std::abs(lline_col_min - lolline_col_min);
520 
521  if (dx_lr < delta_alignment)
522  left_right_aligned = true;
523 
524  if (dx_llol < delta_alignment)
525  left_lol_aligned = true;
526 
527  if (left_right_aligned && left_lol_aligned)
528  {
529  const int left_right_col_min = std::min(lline_col_min, rline_col_min);
530  const int dx_lrc = std::abs(left_right_col_min - cline_col_min);
531  const float l_char_width = 1.5f * lines(l).char_width();
532 
     // The current line starts further right than both neighbors,
     // by more than 1.5 and less than 4.5 character widths.
533  if (dx_lrc > l_char_width &&
534  dx_lrc < 3.0f * l_char_width &&
535  cline_col_min > rline_col_min &&
536  cline_col_min > lline_col_min)
537  {
538  const line_id_t out_right_nbh = output(right_nbh);
539 
540  if (out_right_nbh != l)
541  {
542  if (output(out_right_nbh) == l)
543  output(out_right_nbh) = out_right_nbh;
544  right(l) = l;
545  }
546  else
547  output(right_nbh) = right_nbh;
548  continue;
549  }
550  }
551  }
552 
553 //-----------------------------------------------------------------------------
554 // ___________________________
555 // |___________________________|
556 // ___________________
557 // |___________________| End of the paragraph - Current line
558 // ________________________
559 // |________________________| Beginning of a new one
560 // ___________________________
561 // |___________________________| Left of left of current line
562 //
563 // End of paragraph case : we try to find an end to the current paragraph
564 //
565 //-----------------------------------------------------------------------------
566 
567  {
568  // Check if the current line neighbors are aligned
569  bool left_right_max_aligned = false;
570  bool left_current_min_aligned = false;
571  bool lol_current_min_aligned = false;
     // True when the left neighbor has no left neighbor of its own.
572  const bool lol_is_left = output(left_nbh) == left_nbh;
573  const int dx_lr_max = std::abs(lline_col_max - rline_col_max);
574  const int dx_lc_min = std::abs(lline_col_min - cline_col_min);
575  const int dx_lolc_min = std::abs(lolline_col_min - cline_col_min);
576 
577  if (dx_lr_max < delta_alignment)
578  left_right_max_aligned = true;
579 
580  if (dx_lc_min < delta_alignment)
581  left_current_min_aligned = true;
582 
583  if (dx_lolc_min < delta_alignment)
584  lol_current_min_aligned = true;
585 
586  if (!left_current_min_aligned && left_right_max_aligned &&
587  (lol_current_min_aligned || lol_is_left))
588  {
589  const int dx_lrc = std::abs(lline_col_max - cline_col_max);
590  const int l_char_width = lines(l).char_width();
591  const int dx_indent = std::abs(std::max(lline_col_min,
592  rline_col_min) - cline_col_min);
593 
594  if (dx_lrc > l_char_width &&
595  dx_indent < 4 * delta_alignment &&
596  cline_col_max < lline_col_max &&
597  cline_col_min < lline_col_min &&
598  (lline_col_min > lolline_col_min || lol_is_left))
599  {
600  output(l) = l;
601  continue;
602  }
603  }
604  }
605 
606 //-----------------------------------------------------------------------------
607 // ___________________________
608 // |___________________________|
609 // ___________________________
610 // |___________________________|
611 // ________________________
612 // |________________________|
613 // ___________________________
614 // |___________________________|
615 //
616 // Simple case : paragraphs are justified on the left. We try to find any
617 // indentation like above.
618 //
619 //-----------------------------------------------------------------------------
620 
621  {
622  const line_id_t ror_nbh = right(right_nbh);
623  const box2d& ror_line_bbox = lines(ror_nbh).bbox();
624  const int rorline_col_min = ror_line_bbox.pmin().col();
625 
626  bool right_ror_min_aligned = false;
627  bool left_right_aligned = false;
628  const int dx_lr = std::abs(lline_col_min - rline_col_min);
629  const int dx_rror_min = std::abs(rline_col_min - rorline_col_min);
630 
631  if (dx_rror_min < delta_alignment)
632  right_ror_min_aligned = true;
633 
634  if (dx_lr < delta_alignment)
635  left_right_aligned = true;
636 
     // This test requires the right neighbor to have a right neighbor
     // of its own.
637  if (right_ror_min_aligned && left_right_aligned &&
638  ror_nbh != right_nbh)
639  {
640  const int left_right_col_min = std::min(lline_col_min, rline_col_min);
641  const int dx_lrc = std::abs(left_right_col_min - cline_col_min);
642  const float l_char_width = 1.5f * lines(l).char_width();
643 
644  if (dx_lrc > l_char_width &&
645  !may_have_another_left_link(right, right_nbh, l, lines) &&
646  dx_lrc < 10.0f * l_char_width &&
647  cline_col_min > rline_col_min &&
648  cline_col_min > lline_col_min)
649  {
650  const line_id_t out_right_nbh = output(right_nbh);
651 
652  if (out_right_nbh != l)
653  {
654  if (output(out_right_nbh) == l)
655  output(out_right_nbh) = out_right_nbh;
656  right(l) = l;
657  }
658  else
659  output(right_nbh) = right_nbh;
660  continue;
661  }
662  }
663  }
664 
665 //-----------------------------------------------------------------------------
666 // ___________________________
667 // |___________________________|
668 // ___________
669 // |___________|
670 // ________________________
671 // |________________________|
672 // ___________________________
673 // |___________________________|
674 //
675 // Simple case : paragraphs are justified on the left. We try to find any
676 // indentation like above.
677 //
678 //-----------------------------------------------------------------------------
679 
680  {
681  const line_id_t ror_nbh = right(right_nbh);
682  const box2d& ror_line_bbox = lines(ror_nbh).bbox();
683  const int rorline_col_min = ror_line_bbox.pmin().col();
684 
685  bool left_ror_aligned = false;
686  const int dx_lror = std::abs(lline_col_min - rorline_col_min);
687 
688  if (dx_lror < delta_alignment)
689  left_ror_aligned = true;
690 
691  if (left_ror_aligned)
692  {
693  const int left_ror_col_min = std::min(lline_col_min, rorline_col_min);
694  const int dx_lrorc = std::abs(left_ror_col_min - cline_col_min);
695  const float l_char_width = 1.5f * lines(l).char_width();
696  const int dx_lrorr = std::abs(left_ror_col_min - rline_col_min);
697  const int dx_crmax = std::abs(rline_col_max - cline_col_max);
698 
699  if (dx_lrorc > l_char_width &&
700  dx_lrorr > 5 * l_char_width &&
701  dx_lrorr > dx_lrorc &&
702  dx_crmax > 5 * l_char_width &&
703  !may_have_another_left_link(right, right_nbh, l, lines) &&
704  dx_lrorc < 10.0f * l_char_width &&
705  cline_col_min > rorline_col_min &&
706  cline_col_min > lline_col_min)
707  {
     // Here the right link of the right neighbor is the one reset.
708  right(right_nbh) = right_nbh;
709  continue;
710  }
711  }
712  }
713 
714 
715 // Strange case
     // Unlike the other tests, this one does not `continue`: the last
     // test below still runs after the link has been cut.
716  {
717  if (rline_col_min > current_line_bbox.pcenter().col()
718  && !may_have_another_left_link(right, right_nbh, l, lines)
719  && cline_col_max < rline_col_max
720  && output(right_nbh) == l)
721  {
722  output(right_nbh) = right_nbh;
723  }
724  }
725 
726 //-----------------------------------------------------------------------------
727 // ___________________________
728 // |___________________________|
729 // ___________________________
730 // |___________________________|
731 // ________________________
732 // |________________________|
733 //
734 // Simple case : paragraphs are justified on the left. We try to find any
735 // indentation like above at the end of a column.
736 //
737 //-----------------------------------------------------------------------------
738 
     // Only for lines that have no left neighbor.
739  if (left_nbh == l)
740  {
741  const line_id_t ror_nbh = right(right_nbh);
742  const box2d& ror_line_bbox = lines(ror_nbh).bbox();
743  const int rorline_col_min = ror_line_bbox.pmin().col();
744 
745  bool right_ror_min_aligned = false;
746  const int dx_rror_min = std::abs(rline_col_min - rorline_col_min);
747 
748  if (dx_rror_min < delta_alignment)
749  right_ror_min_aligned = true;
750 
751  if (right_ror_min_aligned)
752  {
753  const int right_ror_col_min = std::min(rline_col_min, rorline_col_min);
754  const int dx_rrorc = std::abs(right_ror_col_min - cline_col_min);
755  const float l_char_width = 1.5f * lines(l).char_width();
756 
757  if (dx_rrorc > l_char_width &&
758  dx_rrorc < 10.0f * l_char_width &&
759  cline_col_min > rline_col_min &&
760  cline_col_max >= rline_col_max)
761  {
762  const line_id_t out_right_nbh = output(right_nbh);
763 
764  if (out_right_nbh != l)
765  {
766  if (output(out_right_nbh) == l)
767  output(out_right_nbh) = out_right_nbh;
768  right(l) = l;
769  }
770  else
771  output(right_nbh) = right_nbh;
772  continue;
773  }
774  }
775  }
776  }
777  }
778 
779 //-------------------------------------------------------------
780 // Preparation of the lines before linking them.
781 // For each line we draw the top and the bottom of it.
782 // Let i be the line id plus one. The top of the line is then drawn
783 // with the value 2 * i in the block image and the bottom with
784 // 2 * i + 1.
785 //
786 //-------------------------------------------------------------
787 
788  template <typename L>
789  inline
790  void prepare_lines(const box2d& domain,
791  const line_set<L>& lines,
792  L& blocks,
794  {
795  std::map< int, std::vector< const box2d* > > drawn_lines;
796  // const unsigned nlines = lines.nelements();
797 
798  // For each line
799  //for (unsigned l = 0; l < nlines; ++l)
800  for_all_lines(l, lines)
801  if (lines(l).is_textline())
802  {
803  // Rotation of the bounding box
804  box2d b = geom::rotate(lines(l).bbox(), -90, domain.pcenter());
805 // rbbox.append(b);
806  rbbox(l) = b;
807 
808  const unsigned index = l + 1;
809  const unsigned even_index = 2 * index;
810  const unsigned odd_index = even_index + 1;
811 
812  // Top of the line
813  {
814  bool not_finished = true;
815  int col_offset = 0;
816 
817  while (not_finished)
818  {
819  // Looking for a column in the image to draw the top of the
820  // line
821 
822  const int col = b.pmax().col() + col_offset;
823  std::map< int, std::vector< const box2d* > >::iterator it
824  = drawn_lines.find(col);
825 
826  if (it != drawn_lines.end())
827  {
828  const std::vector< const box2d* >& lines = (*it).second;
829  const unsigned nb_lines = lines.size();
830  unsigned i = 0;
831 
832  for (i = 0; i < nb_lines; ++i)
833  {
834  const box2d* box = lines[i];
835  const int min_row = std::max(b.pmin().row(), box->pmin().row());
836  const int max_row = std::min(b.pmax().row(), box->pmax().row());
837 
838  if (min_row - max_row <= 0)
839  break;
840  }
841 
842  if (i == nb_lines)
843  {
844  mln::draw::line(blocks, point2d(b.pmin().row(), col),
845  point2d(b.pmax().row(), col), even_index);
846  not_finished = false;
847  drawn_lines[col].push_back(&(rbbox[l]));
848  }
849  else
850  ++col_offset;
851  }
852  else
853  {
854  mln::draw::line(blocks, point2d(b.pmin().row(), col),
855  point2d(b.pmax().row(), col), even_index);
856  not_finished = false;
857  drawn_lines[col].push_back(&(rbbox[l]));
858  }
859  }
860  }
861 
862  // Bottom of the line
863  {
864  bool not_finished = true;
865  int col_offset = 0;
866 
867  while (not_finished)
868  {
869  // Looking for a column in the image to draw the bottom of
870  // the line
871 
872  const int col = b.pmin().col() - col_offset;
873  std::map< int, std::vector< const box2d* > >::iterator it
874  = drawn_lines.find(col);
875 
876  if (it != drawn_lines.end())
877  {
878  const std::vector< const box2d* >& lines = (*it).second;
879  const unsigned nb_lines = lines.size();
880  unsigned i = 0;
881 
882  for (i = 0; i < nb_lines; ++i)
883  {
884  const box2d* box = lines[i];
885  const int min_row = std::max(b.pmin().row(), box->pmin().row());
886  const int max_row = std::min(b.pmax().row(), box->pmax().row());
887 
888  if (min_row - max_row <= 0)
889  break;
890  }
891 
892  if (i == nb_lines)
893  {
894  mln::draw::line(blocks, point2d(b.pmin().row(), col),
895  point2d(b.pmax().row(), col), odd_index);
896  not_finished = false;
897  drawn_lines[col].push_back(&(rbbox[l]));
898  }
899  else
900  ++col_offset;
901  }
902  else
903  {
904  mln::draw::line(blocks, point2d(b.pmin().row(), col),
905  point2d(b.pmax().row(), col), odd_index);
906  not_finished = false;
907  drawn_lines[col].push_back(&(rbbox[l]));
908  }
909  }
910  }
911  }
912  }
913 
// Computes the left link of every text line from the block image.
//
// Each text line first links to itself (non-text lines get 0). Then,
// from two starting points inside the rotated box of the line (its
// center and a point a quarter of the way between its first row and the
// center row, same column), the block image buffer is walked backwards
// cell by cell until an even label, i.e. the top segment of another
// line, is met or the search limit is reached. A label v maps back to
// the line id (v >> 1) - 1.
//
// \param blocks  Block image holding the labeled segments.
// \param rbbox   Rotated bounding box of each line.
// \param lines   Line set.
// \param left    Receives the left links.
914  template <typename L>
915  inline
916  void
917  process_left_link(L& blocks,
918  const mln::util::array<box2d>& rbbox,
919  const line_set<L>& lines,
920  line_links<L>& left)
921  {
922  typedef scribo::def::lbl_type V;
923 
924  // At the beginning each line is its own neighbor
925  for_all_lines(l, lines)
926  if (lines(l).is_textline())
927  left(l) = l;
928  else
929  left(l) = 0;
930 
931  // const unsigned nlines = lines.nelements();
932 
933  // For each line
934  for_all_lines(l, lines)
935  if (lines(l).is_textline())
936  {
937  // Max distance for the line search
938  int dmax = 1.5f * lines(l).x_height();
939 
940  // Starting points in the current line box
941  point2d c = rbbox(l).pcenter();
942  point2d q(rbbox(l).pmin().row() + ((c.row() - rbbox(l).pmin().row()) / 4), c.col());
943 
     // Half of the column extent of the rotated box.
944  int
945  midcol = (rbbox(l).pmax().col()
946  - rbbox(l).pmin().col()) / 2;
947 
948  // Left
949  {
950  // Left margin: distance from the center to the first image column
951  int
952  nleftima = c.col() - blocks.domain().pmin().col(),
953  // Left search distance
954  nleft = std::min(nleftima, midcol + dmax);
955 
956  V
957  // Starting points in the box
958  *p = &blocks(c),
959  *p2 = &blocks(q),
960  // End of search
961  *pstop = p - nleft - 1,
962  // Line neighbor
963  *nbh_p = 0;
964 
965  // While we haven't found a neighbor or reached the limit
966  for (; p != pstop; --p, --p2)
967  {
968  if (*p2 != literal::zero // Not the background
969  && ((*p2 % 2) == 0) // Even label: top segment of a line
970  && left((*p2 >> 1) - 1) != l) // No loops
971  {
972  // Neighbor found, we stop the search
973  nbh_p = p2;
974  break;
975  }
976 
977  if (*p != literal::zero // Not the background
978  && ((*p % 2) == 0) // Even label: top segment of a line
979  && left((*p >> 1) - 1) != l) // No loops
980  {
981  // Neighbor found, we stop the search
982  nbh_p = p;
983  break;
984  }
985  }
986 
987  // If a neighbor was found, then we have found the top of the
988  // line. We are then looking for the bottom of the encountered
989  // line. If during the search process we find a complete line
990  // included in the touched line, this line is considered as
991  // the neighbor under certain conditions (see below)
992 
993  //---------------------------------------------------------------
994  // _________________________ |
995  // |_________________________| => Current line | Search direction
996  // v
997  // => First encountered top line
998  // __________________________________________________ 2Q
999  // | Q |
1000  // | _________________________ |2P
1001  // | |_____________P___________| => Second top |2P + 1
1002  // | line |
1003  // |__________________________________________________|2Q + 1
1004  //
1005  //
1006  //---------------------------------------------------------------
1007 
1008  if (nbh_p)
1009  {
     // Even labels (tops) met so far, the first one included.
1010  std::vector<V> lines_nbh;
     // Odd label (bottom) matching the first top met.
1011  const V end_p = *nbh_p + 1;
1012  const V* nbh_p_copy = nbh_p;
1013 
     // NOTE(review): this walk has no explicit bound; it relies on the
     // matching bottom label being met along the way -- confirm.
1014  for (; *nbh_p != end_p; --nbh_p)
1015  {
1016  if ((*nbh_p) != literal::zero) // Not the background
1017  {
1018  if ((*nbh_p) % 2 == 0)// We have found the top of
1019  // another line
1020  lines_nbh.push_back(*nbh_p);
1021  else
1022  {
1023  // We have found the bottom of a line. We are looking if
1024  // we have already encountered the top of this
1025  // line. If so, we link the current line with this one
1026  // under certain conditions:
1027 
1028  if (std::find(lines_nbh.begin(), lines_nbh.end(),
1029  (*nbh_p) - 1) != lines_nbh.end())
1030  {
1031  // If we can link the complete line with the current line
1032  if (// It must be in the search range
1033  nbh_p > pstop
1034  // Avoid loops
1035  && left(((*nbh_p - 1) >> 1) - 1) != l)
1036  left(l) = ((*nbh_p - 1) >> 1) - 1;
1037 
1038  // We have found a complete line so we stop the search
1039  break;
1040  }
1041  }
1042  }
1043  }
1044 
1045 
1046  // If we haven't found any included line in the first
1047  // neighbor, then the line is considered as the neighbor of
1048  // the current line
1049  if (*nbh_p == end_p)
1050  left(l) = (*nbh_p_copy >> 1) - 1;
1051  }
1052  }
1053  }
1054  }
1055 
1056 
1057  // We assume that the lines have been rotated
1058  template <typename L>
1059  inline
1060  void
1061  process_right_link(L& blocks,
1062  const mln::util::array<box2d>& rbbox,
1063  const line_set<L>& lines,
1064  line_links<L>& right)
1065  {
1066  typedef scribo::def::lbl_type V;
1067 
1068  // At the beginning each line is its own neighbor
1069  for_all_lines(l, lines)
1070  if (lines(l).is_textline())
1071  right(l) = l;
1072  else
1073  right(l) = 0;
1074 
1075  // const unsigned nlines = lines.nelements();
1076 
1077  // For each line
1078  for_all_lines(l, lines)
1079  if (lines(l).is_textline())
1080  {
1081  // Max distance for the line search
1082  int dmax = 1.5f * lines(l).x_height();
1083 
1084  // Starting points in the current line box
1085  point2d c = rbbox(l).pcenter();
1086  point2d q(rbbox(l).pmax().row() - ((rbbox(l).pmax().row() - c.row()) / 4), c.col());
1087 
1088  int
1089  midcol = (rbbox(l).pmax().col()
1090  - rbbox(l).pmin().col()) / 2;
1091 
1092  // Right
1093  {
1094  int
1095  nrightima = geom::ncols(blocks) - c.col() + blocks.domain().pmin().col(),
1096  nright = std::min(nrightima, midcol + dmax);
1097 
1098  V
1099  // Starting points in the box
1100  *p = &blocks(c),
1101  *p2 = &blocks(q),
1102  // End of search
1103  *pstop = p + nright - 1,
1104  // Line neighbor
1105  *nbh_p = 0;
1106 
1107  // While we haven't found a neighbor or reached the limit
1108  for (; p != pstop; ++p, ++p2)
1109  {
1110  if (*p2 != literal::zero // Not the background
1111  && ((*p2 % 2) == 1) // Looking for the bottom of a line
1112  && right(((*p2 - 1) >> 1) - 1) != l) // No loops
1113  {
1114  // Neightbor found, we stop the research
1115  nbh_p = p2;
1116  break;
1117  }
1118 
1119  if (*p != literal::zero // Not the background
1120  && ((*p % 2) == 1) // Looking for the bottom of a line
1121  && right(((*p - 1) >> 1) - 1) != l) // No loops
1122  {
1123  // Neightbor found, we stop the research
1124  nbh_p = p;
1125  break;
1126  }
1127  }
1128 
1129  // If a neighbor was found, then we have found the bottom of the
1130  // line. We are then looking for the top of the encountered
1131  // line. If during the search process we find a complete line
1132  // included in the touched line, this line is considered as
1133  // the neighbor under certain conditions (see below)
1134 
1135  //---------------------------------------------------------------
1136  //
1137  //
1138  // __________________________________________________ 2Q
1139  // | Q |
1140  // | _________________________ |2P
1141  // | |_____________P___________| => Second bottom |2P + 1
1142  // | line |
1143  // |__________________________________________________|2Q + 1
1144  // => First encountered bottom line
1145  // _________________________ ^
1146  // |_________________________| => Current line | Search direction
1147  // |
1148  //---------------------------------------------------------------
1149 
1150  if (nbh_p)
1151  {
1152  std::vector<V> lines_nbh;
1153  const V end_p = *nbh_p - 1;
1154  const V* nbh_p_copy = nbh_p;
1155 
1156  for (; *nbh_p != end_p; ++nbh_p)
1157  {
1158  if (*nbh_p != literal::zero) // Not the background
1159  {
1160  if (*nbh_p % 2 == 1) // We have found the bottom of
1161  // another line
1162  lines_nbh.push_back(*nbh_p);
1163  else
1164  {
1165  // We have found the top of a line. We are looking if
1166  //we have already encountered the bottom of this
1167  // line. If so, we link the current line with this one
1168  // under certain conditions:
1169 
1170  if (std::find(lines_nbh.begin(), lines_nbh.end(),
1171  *nbh_p + 1) != lines_nbh.end())
1172  {
1173  // If we can link the complete line with the current line
1174  if (// It must be in the search range
1175  nbh_p < pstop
1176  // Avoid loops
1177  && right((*nbh_p >> 1) - 1) != l)
1178  right(l) = (*nbh_p >> 1) - 1;
1179 
1180  // We have found a complete line, so we stop the search
1181  break;
1182  }
1183  }
1184  }
1185  }
1186 
1187  // If we haven't found any included line in the first
1188  // neighbor, then the line is considered as the neighbor of
1189  // the current line
1190 
1191  if (*nbh_p == end_p)
1192  right(l) = ((*nbh_p_copy - 1) >> 1) - 1;
1193  }
1194  }
1195  }
1196  }
1197 
1198 //-----------------------------------------------------------------------
1199 // Finalizing the links by merging information extracted from the left
1200 // and right links
1201 //-----------------------------------------------------------------------
1202 
1203  template< typename L >
1204  inline
1205  void finalize_links(line_links<L>& left,
1206  line_links<L>& right,
1207  const line_set<L>& lines)
1208  {
1209  // const unsigned nlines = lines.nelements();
1210 
1211  for_all_lines(l, lines)
1212  if (lines(l).is_textline())
1213  {
1214  const unsigned left_value = left(l);
1215  const unsigned right_value = right(l);
1216 
1217  // If the right neighbor of my left neighbor is itself then its
1218  // right neighbor is me
1219  {
1220  line_id_t& v = right(left_value);
1221 
1222  if (v == left_value)
1223  v = l;
1224  }
1225 
1226  // If the left neighbor of my right neighbor is itself then its
1227  // left neighbor is me
1228  {
1229  line_id_t& v = left(right_value);
1230 
1231  if (v == right_value)
1232  v = l;
1233  }
1234  }
1235  }
1236 
1237  } // end of namespace scribo::text::internal
1238 
1239 
1240  template <typename L>
1241  paragraph_set<L>
1242  extract_paragraphs_hdoc(line_set<L>& lines,
1243  const image2d<bool>& input)
1244  {
1245  typedef scribo::def::lbl_type V;
1246 
1247  image2d<V> blocks(geom::rotate(input.domain(), -90, input.domain().pcenter()));
1248  data::fill(blocks, 0);
1249 
1250  // util::array< line_info<L> > lines_info;
1251 
1252  // for_all_lines(l, lines)
1253  // {
1254  // if (lines(l).is_textline())
1255  // lines_info.append(lines(l));
1256  // }
1257 
1260  line_links<L> left(lines);
1261  left(0) = 0;
1262  line_links<L> right(lines);
1263  right(0) = 0;
1264  line_links<L> output(lines);
1265  output(0) = 0;
1266 
1267  rbbox.resize(lines.nelements() + 1);
1268 
1269  internal::prepare_lines(input.domain(), lines , blocks, rbbox);
1270  internal::process_left_link(blocks, rbbox, lines , left);
1271  internal::process_right_link(blocks, rbbox, lines , right);
1272  internal::finalize_links(left, right, lines );
1273  // finalize_line_merging(left, right, lines);
1274  internal::paragraph_links(left, right, output, lines);
1275 
1276  paragraph_set<L> par_set = make::paragraph(output, right);
1277  return par_set;
1278  }
1279 
1280 # endif // ! MLN_INCLUDE_ONLY
1281 
1282  } // end of namespace scribo::text
1283 
1284 } // end of namespace scribo
1285 
1286 #endif // ! SCRIBO_TEXT_EXTRACT_PARAGRAPHS_HDOC_HH