/usr/include/CLucene/analysis/de/GermanStemmer.h is in libclucene-dev 2.3.3.4-4build1.
This file is owned by root:root, with mode 0o644.
The actual contents of the file can be viewed below.
/*------------------------------------------------------------------------------
* Copyright (C) 2003-2010 Ben van Klinken and the CLucene Team
*
* Distributable under the terms of either the Apache License (Version 2.0) or
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------*/
#ifndef _lucene_analysis_de_GermanStemmer
#define _lucene_analysis_de_GermanStemmer
CL_CLASS_DEF(util,StringBuffer)
CL_NS_DEF2(analysis,de)
/**
* A stemmer for German words. The algorithm is based on the report
* "A Fast and Simple Stemming Algorithm for German Words" by Jörg
* Caumanns (joerg.caumanns at isst.fhg.de).
*
* The only public operation is stem(); the remaining member functions are
* private steps of the algorithm that operate on a StringBuffer.
*
* NOTE(review): the member <tt>sb</tt> is held by value, which requires
* StringBuffer to be a complete type at this point. This header itself only
* names it through CL_CLASS_DEF(util,StringBuffer) - presumably a forward
* declaration - so includers apparently must pull in the StringBuffer
* definition first. Confirm against the CLucene headers.
*/
class CLUCENE_CONTRIBS_EXPORT GermanStemmer
{
private:
/**
* Buffer that holds the term while it is being stemmed.
*/
CL_NS(util)::StringBuffer sb;
/**
* Number of characters that are removed by <tt>substitute()</tt> while stemming.
*/
int substCount;
public:
/**
* Default constructor. The initial state of the members is set up in the
* implementation file and is not visible from this header.
*/
GermanStemmer();
/**
* Stems the given term to a unique <tt>discriminator</tt>.
*
* @param term The term that should be stemmed.
* @param length Length of <tt>term</tt> in characters. The default of -1
* converts to the largest <tt>size_t</tt> value; presumably it is a sentinel
* meaning "determine the length from the terminated string" - TODO confirm
* against the implementation.
* @return Discriminator for <tt>term</tt>
*
* NOTE(review): ownership of the returned pointer (caller-owned copy vs.
* pointer into the internal buffer) cannot be determined from this
* declaration - confirm before freeing or caching the result.
*/
TCHAR* stem(const TCHAR* term, size_t length = -1);
private:
/**
* Checks if a term could be stemmed.
*
* @param term The term to check.
* @param length Length of <tt>term</tt>; the default of -1 converts to the
* largest <tt>size_t</tt> value (presumably the same sentinel as in stem() -
* TODO confirm).
* @return true if, and only if, the given term consists only of letters.
*/
bool isStemmable(const TCHAR* term, size_t length = -1) const;
/**
* Performs suffix stripping (stemming) on the current term. The stripping is
* reduced to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and "nd",
* from which all regular suffixes are built. The simplification causes
* some overstemming, and way more irregular stems, but still provides unique
* discriminators in most of those cases.
* The algorithm is context free, except for the length restrictions.
*
* @param buffer Buffer containing the term; taken by non-const reference,
* so presumably modified in place.
*/
void strip(CL_NS(util)::StringBuffer& buffer);
/**
* Does some optimizations on the term. These optimizations are
* contextual.
*
* @param buffer Buffer containing the term; taken by non-const reference,
* so presumably modified in place.
*/
void optimize(CL_NS(util)::StringBuffer& buffer);
/**
* Removes a participle denotation ("ge") from a term.
* (The function name keeps the historical spelling "ParticleDenotion".)
*
* @param buffer Buffer containing the term; taken by non-const reference,
* so presumably modified in place.
*/
void removeParticleDenotion(CL_NS(util)::StringBuffer& buffer);
/**
* Does some substitutions on the term to reduce overstemming:
*
* - Substitute umlauts with their corresponding vowel: äöü -> aou,
* "ß" is substituted by "ss"
* - Substitute the second char of a pair of equal characters with
* an asterisk: ?? -> ?*
* - Substitute some common character combinations with a token:
* sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
*
* @param buffer Buffer containing the term; taken by non-const reference,
* so presumably modified in place.
*/
void substitute(CL_NS(util)::StringBuffer& buffer);
/**
* Undoes the changes made by substitute(), i.e. the character pairs and
* character combinations. Umlauts will remain as their corresponding vowel,
* and "ß" remains as "ss".
*
* @param buffer Buffer containing the term; taken by non-const reference,
* so presumably modified in place.
*/
void resubstitute(CL_NS(util)::StringBuffer& buffer);
};
CL_NS_END2
#endif
|