Lucene++ - a full-featured, c++ search engine
API Documentation


PorterStemmer.h
Go to the documentation of this file.
1 // Copyright (c) 2009-2014 Alan Wright. All rights reserved.
3 // Distributable under the terms of either the Apache License (Version 2.0)
4 // or the GNU Lesser General Public License.
6 
7 #ifndef PORTERSTEMMER_H
8 #define PORTERSTEMMER_H
9 
10 #include "LuceneObject.h"
11 
12 namespace Lucene {
13 
/// Porter stemming algorithm. This page describes the class as "the Porter
/// stemming algorithm, coded up as thread-safe ANSI C by the author".
/// Derives from LuceneObject.
///
/// The comments below refer to characters of the buffer `b` by index
/// (e.g. "0,...j" means b[0]..b[j]).
26 class PorterStemmer : public LuceneObject {
27 public:
28  PorterStemmer();
29  virtual ~PorterStemmer();
30 
32 
33 protected:
34  wchar_t* b; // buffer for word to be stemmed
35  int32_t k; // offset to the end of the string
36  int32_t j; // a general offset into the string
37  int32_t i; // initial length of word
    // NOTE(review): `dirty` has no description on this page; presumably it
    // records whether stemming changed the word -- confirm in PorterStemmer.cpp.
38  bool dirty;
39 
40 public:
    /// NOTE(review): no description on this page. Presumably stems `word`
    /// and returns whether it was changed -- confirm in PorterStemmer.cpp.
41  bool stem(CharArray word);
42 
    /// NOTE(review): no description on this page. The parameter names match
    /// the members `b` (buffer for word to be stemmed) and `k` (offset to the
    /// end of the string); the meaning of the return value should be
    /// confirmed in PorterStemmer.cpp.
46  bool stem(wchar_t* b, int32_t k);
47 
    // NOTE(review): neither accessor is described on this page; presumably
    // they expose the stemmed text and its length after stem() -- confirm.
48  wchar_t* getResultBuffer();
49  int32_t getResultLength();
50 
51 protected:
    /// Returns true if b[i] is a consonant.
53  bool cons(int32_t i);
54 
    /// Measures the number of consonant sequences between 0 and j.
    /// (The remainder of the description is truncated on this page.)
63  int32_t m();
64 
    /// Returns true if 0,...j contains a vowel.
66  bool vowelinstem();
67 
    /// Returns true if j,(j-1) contain a double consonant.
69  bool doublec(int32_t j);
70 
    /// Returns true if i-2,i-1,i has the form consonant - vowel - consonant,
    /// subject to a further condition on the second consonant (that part of
    /// the description is truncated on this page).
76  bool cvc(int32_t i);
77 
    /// Returns true if 0,...k ends with the string s.
79  bool ends(const wchar_t* s);
80 
    /// Sets (j+1),...k to the characters in the string s, readjusting k.
82  void setto(const wchar_t* s);
83 
    /// NOTE(review): no description on this page; takes a replacement-style
    /// string argument like setto() -- confirm exact behavior in
    /// PorterStemmer.cpp.
84  void r(const wchar_t* s);
85 
    /// Gets rid of plurals and -ed or -ing.
105  void step1ab();
106 
    /// Turns terminal y to i when there is another vowel in the stem.
108  void step1c();
109 
    /// Maps double suffixes to single ones, so -ization ( = -ize plus -ation)
    /// maps to -ize etc.
112  void step2();
113 
    /// Deals with -ic-, -full, -ness etc., using a strategy similar to step2.
115  void step3();
116 
    /// Takes off -ant, -ence etc., in context vcvc.
118  void step4();
119 
    /// Removes a final -e if m() > 1, and changes -ll to -l if m() > 1.
121  void step5();
122 };
123 
124 }
125 
126 #endif
int32_t m()
Measures the number of consonant sequences between 0 and j. If c is a consonant sequence and v a vowe...
void step4()
Takes off -ant, -ence etc., in context vcvc.
wchar_t * b
Definition: PorterStemmer.h:34
bool cons(int32_t i)
Returns true if b[i] is a consonant. ('b' means 'z->b', but here and below we drop 'z->' in comments...
bool stem(CharArray word)
void step1ab()
step1ab() gets rid of plurals and -ed or -ing. eg.
void step2()
Maps double suffixes to single ones, so -ization ( = -ize plus -ation) maps to -ize etc...
This is the Porter stemming algorithm, coded up as thread-safe ANSI C by the author.
Definition: PorterStemmer.h:26
void setto(const wchar_t *s)
Sets (j+1),...k to the characters in the string s, readjusting k.
bool vowelinstem()
Return true if 0,...j contains a vowel.
int32_t k
Definition: PorterStemmer.h:35
bool dirty
Definition: PorterStemmer.h:38
bool doublec(int32_t j)
Return true if j,(j-1) contain a double consonant.
Base class for all Lucene classes.
Definition: LuceneObject.h:31
#define LUCENE_CLASS(Name)
Definition: LuceneObject.h:24
Definition: AbstractAllTermDocs.h:12
void r(const wchar_t *s)
void step3()
Deals with -ic-, -full, -ness etc., using a strategy similar to step2.
void step5()
Removes a final -e if m() > 1, and changes -ll to -l if m() > 1.
bool cvc(int32_t i)
Return true if i-2,i-1,i has the form consonant - vowel - consonant and also if the second c is not w...
bool ends(const wchar_t *s)
Returns true if 0,...k ends with the string s.
void step1c()
Turns terminal y to i when there is another vowel in the stem.
wchar_t * getResultBuffer()
int32_t j
Definition: PorterStemmer.h:36
int32_t i
Definition: PorterStemmer.h:37

clucene.sourceforge.net