28 #ifndef SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH
29 # define SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH
36 # include <QtCore/QSet>
37 # include <QtCore/QString>
38 # include <QtCore/QStringList>
39 # include <QtCore/QTextStream>
40 # include <QtCore/QTextCodec>
41 # include <QtGui/QImage>
43 # include <mln/core/image/image2d.hh>
44 # include <mln/data/transform.hh>
45 # include <mln/logical/not.hh>
46 # include <mln/value/qt/rgb32.hh>
47 # include <mln/fun/v2v/qt_rgb_to_int_u.hh>
49 # include <scribo/convert/from_qimage.hh>
50 # include <scribo/binarization/sauvola_ms.hh>
51 # include <scribo/preprocessing/deskew.hh>
52 # include <scribo/toolchain/text_in_doc.hh>
88 # ifndef MLN_INCLUDE_ONLY
// text_extraction: collects the text found in a QImage as a set of words.
// Only part of the definition is visible here (the return type, the braces
// and several statements are outside this view).
//
// input:    image to process; must not be null (checked by the precondition
//           below).
// language: language code, "eng" by default. It is converted to a UTF-8
//           C string and passed as the last argument of two calls whose
//           beginnings are not visible here.
//
// NOTE(review): presumably the function returns the QSet<QString> named
// `output` that is filled at the end -- the return statement is not visible
// in this view; confirm.
91 text_extraction(
const QImage& input,
const QString& language = QString(
"eng"))
// Trace entry into this routine under its fully qualified name.
93 mln_trace(
"scribo::toolchain::nepomuk::text_extraction");
// A null QImage is rejected up front.
95 mln_precondition(!input.isNull());
// Convert the Qt image into a Milena image (declaration of input_mln is not
// visible here).
101 input_mln = scribo::convert::from_qimage(input);
// Two separate line sets; judging by their names, one for text found on the
// background and one for text found on the foreground -- TODO confirm
// against the calls that fill them.
123 line_set<L> lines_bg, lines_fg;
// Tail of a call that receives the language as a UTF-8 C string (the start
// of the call is not visible here).
129 language.toUtf8().data());
// Tail of a second call receiving the same language argument.
137 language.toUtf8().data());
// Resulting set of distinct words.
141 QSet<QString> output;
// Codec used to decode the recognized line text (std::string bytes) as UTF-8
// into QString.
145 QTextCodec *codec = QTextCodec::codecForName(
"UTF-8");
// Text stream appending into tmp_out (declared outside this view); all
// recognized lines are accumulated there, separated by spaces.
148 QTextStream stream(&tmp_out, QIODevice::WriteOnly);
149 stream.setCodec(
"UTF-8");
// Append the text of every lines_bg entry that has text, preceded by a
// space so that words of consecutive lines stay separated.
151 for_all_lines(l, lines_bg)
152 if (lines_bg(l).has_text())
153 stream << " " << codec->toUnicode(lines_bg(l).text().c_str());
// Same for lines_fg, but here the decoded text is additionally stripped of
// the characters ? ! ( ) [ ] { } . , ; " ' ` _ before being appended.
// NOTE(review): as visible here, this punctuation removal is applied only to
// the lines_fg text, not to the lines_bg text above -- confirm whether that
// asymmetry is intended.
155 for_all_lines(l, lines_fg)
156 if (lines_fg(l).has_text())
157 stream << " " << codec->toUnicode(lines_fg(l).text().c_str())
158 .remove(QRegExp("[\\?!()\\[\\]\\{\\}\\.,;\\\
"\\\'`_]"));
// Split the accumulated text on spaces, dropping empty entries.
160 QStringList list = tmp_out.split(
' ', QString::SkipEmptyParts);
// Converting the list to a set removes duplicate words.
162 output = QSet<QString>::fromList(list);
168 # endif // ! MLN_INCLUDE_ONLY
177 #endif // ! SCRIBO_TOOLCHAIN_NEPOMUK_TEXT_EXTRACTION_HH