$extrastylesheet
Olena  User documentation 2.1
An Image Processing Platform
 All Classes Namespaces Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
text_extraction.hh
1 // Copyright (C) 2010, 2011 EPITA Research and Development Laboratory
2 // (LRDE)
3 //
4 // This file is part of Olena.
5 //
6 // Olena is free software: you can redistribute it and/or modify it under
7 // the terms of the GNU General Public License as published by the Free
8 // Software Foundation, version 2 of the License.
9 //
10 // Olena is distributed in the hope that it will be useful,
11 // but WITHOUT ANY WARRANTY; without even the implied warranty of
12 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 // General Public License for more details.
14 //
15 // You should have received a copy of the GNU General Public License
16 // along with Olena. If not, see <http://www.gnu.org/licenses/>.
17 //
18 // As a special exception, you may use this file as part of a free
19 // software project without restriction. Specifically, if other files
20 // instantiate templates or use macros or inline functions from this
21 // file, or you compile this file and link it with other files to produce
22 // an executable, this file does not by itself cause the resulting
23 // executable to be covered by the GNU General Public License. This
24 // exception does not however invalidate any other reasons why the
25 // executable file might be covered by the GNU General Public License.
26 
27 
28 #ifndef SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH
29 # define SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH
30 
34 
35 
36 # include <QtCore/QSet>
37 # include <QtCore/QString>
38 # include <QtCore/QStringList>
39 # include <QtCore/QTextStream>
40 # include <QtCore/QTextCodec>
41 # include <QtGui/QImage>
42 
43 # include <mln/core/image/image2d.hh>
44 # include <mln/data/transform.hh>
45 # include <mln/logical/not.hh>
46 # include <mln/value/qt/rgb32.hh>
47 # include <mln/fun/v2v/qt_rgb_to_int_u.hh>
48 
49 # include <scribo/convert/from_qimage.hh>
50 # include <scribo/binarization/sauvola_ms.hh>
51 # include <scribo/preprocessing/deskew.hh>
52 # include <scribo/toolchain/text_in_doc.hh>
53 
54 
55 namespace scribo
56 {
57 
58  namespace toolchain
59  {
60 
61  namespace nepomuk
62  {
63 
84  QSet<QString>
85  text_extraction(const QImage& input, const QString& language);
86 
87 
88 # ifndef MLN_INCLUDE_ONLY
89 
90  QSet<QString>
91  text_extraction(const QImage& input, const QString& language = QString("eng"))
92  {
93  mln_trace("scribo::toolchain::nepomuk::text_extraction");
94 
95  mln_precondition(!input.isNull());
96 
98 
99  // Convert image to Milena's format.
101  input_mln = scribo::convert::from_qimage(input);
102 
103  image2d<bool> input_bin;
104 
105 
106  // Preprocess
107  {
108  // Convert to Gray level image.
110  input_gl = data::transform(input_mln,
112 
113  // Deskew if needed.
114  input_gl = preprocessing::deskew(input_gl);
115 
116  // Binarize foreground to use it in the processing chain.
117  input_bin = scribo::binarization::sauvola_ms(input_gl, 51, 3);
118  }
119 
120 
121 
122 
123  line_set<L> lines_bg, lines_fg;
124  // Process
125  {
126  // Run document toolchain.
127  lines_bg = scribo::toolchain::text_in_doc(input_bin,
128  true,
129  language.toUtf8().data());
130 
131  // Negate document.
132  logical::not_inplace(input_bin);
133 
134  // Run document toolchain.
135  lines_fg = scribo::toolchain::text_in_doc(input_bin,
136  true,
137  language.toUtf8().data());
138  }
139 
140 
141  QSet<QString> output;
142 
143  // Construct output
144  {
145  QTextCodec *codec = QTextCodec::codecForName("UTF-8");
146 
147  QString tmp_out;
148  QTextStream stream(&tmp_out, QIODevice::WriteOnly);
149  stream.setCodec("UTF-8");
150 
151  for_all_lines(l, lines_bg)
152  if (lines_bg(l).has_text())
153  stream << " " << codec->toUnicode(lines_bg(l).text().c_str());
154 
155  for_all_lines(l, lines_fg)
156  if (lines_fg(l).has_text())
157  stream << " " << codec->toUnicode(lines_fg(l).text().c_str())
158  .remove(QRegExp("[\\?!()\\[\\]\\{\\}\\.,;\\\"\\\'`_]")); // Remove useless punctuation.
159 
160  QStringList list = tmp_out.split(' ', QString::SkipEmptyParts);
161 
162  output = QSet<QString>::fromList(list);
163  }
164 
165  return output;
166  }
167 
168 # endif // ! MLN_INCLUDE_ONLY
169 
170  } // end of namespace scribo::toolchain::nepomuk
171 
172  } // end of namespace scribo::toolchain
173 
174 } // end of namespace scribo
175 
176 
177 #endif // ! SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH