EVOLUTION-MANAGER
Edit File: german.py
from .bases import _StandardStemmer from whoosh.compat import u class GermanStemmer(_StandardStemmer): """ The German Snowball stemmer. :cvar __vowels: The German vowels. :type __vowels: unicode :cvar __s_ending: Letters that may directly appear before a word final 's'. :type __s_ending: unicode :cvar __st_ending: Letter that may directly appear before a word final 'st'. :type __st_ending: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :note: A detailed description of the German stemming algorithm can be found under http://snowball.tartarus.org/algorithms/german/stemmer.html """ __vowels = u("aeiouy\xE4\xF6\xFC") __s_ending = "bdfghklmnrt" __st_ending = "bdfghklmnt" __step1_suffixes = ("ern", "em", "er", "en", "es", "e", "s") __step2_suffixes = ("est", "en", "er", "st") __step3_suffixes = ("isch", "lich", "heit", "keit", "end", "ung", "ig", "ik") def stem(self, word): """ Stem a German word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() word = word.replace(u("\xDF"), "ss") # Every occurrence of 'u' and 'y' # between vowels is put into upper case. for i in range(1, len(word) - 1): if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: if word[i] == "u": word = "".join((word[:i], "U", word[i + 1:])) elif word[i] == "y": word = "".join((word[:i], "Y", word[i + 1:])) r1, r2 = self._r1r2_standard(word, self.__vowels) # R1 is adjusted so that the region before it # contains at least 3 letters. for i in range(1, len(word)): if word[i] not in self.__vowels and word[i - 1] in self.__vowels: if len(word[:i + 1]) < 3 and len(word[:i + 1]) > 0: r1 = word[3:] elif len(word[:i + 1]) == 0: return word break # STEP 1 for suffix in self.__step1_suffixes: if r1.endswith(suffix): if (suffix in ("en", "es", "e") and word[-len(suffix) - 4:-len(suffix)] == "niss"): word = word[:-len(suffix) - 1] r1 = r1[:-len(suffix) - 1] r2 = r2[:-len(suffix) - 1] elif suffix == "s": if word[-2] in self.__s_ending: word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] break # STEP 2 for suffix in self.__step2_suffixes: if r1.endswith(suffix): if suffix == "st": if word[-3] in self.__st_ending and len(word[:-3]) >= 3: word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] break # STEP 3: Derivational suffixes for suffix in self.__step3_suffixes: if r2.endswith(suffix): if suffix in ("end", "ung"): if ("ig" in r2[-len(suffix) - 2:-len(suffix)] and "e" not in r2[-len(suffix) - 3:-len(suffix) - 2]): word = word[:-len(suffix) - 2] else: word = word[:-len(suffix)] elif (suffix in ("ig", "ik", "isch") and "e" not in r2[-len(suffix) - 1:-len(suffix)]): word = word[:-len(suffix)] elif suffix in ("lich", "heit"): if ("er" in r1[-len(suffix) - 2:-len(suffix)] or "en" in r1[-len(suffix) - 2:-len(suffix)]): word = word[:-len(suffix) - 2] else: word = word[:-len(suffix)] elif suffix == "keit": if "lich" in r2[-len(suffix) - 4:-len(suffix)]: word = word[:-len(suffix) - 4] elif "ig" in r2[-len(suffix) - 2:-len(suffix)]: word = word[:-len(suffix) - 2] else: word = word[:-len(suffix)] break # Umlaut accents are removed and # 'u' and 'y' are put back into lower case. word = (word.replace(u("\xE4"), "a").replace(u("\xF6"), "o") .replace(u("\xFC"), "u").replace("U", "u") .replace("Y", "y")) return word