EVOLUTION-MANAGER
Edit File: russian.py
from whoosh.compat import u class RussianStemmer(object): """ The Russian Snowball stemmer. :cvar __perfective_gerund_suffixes: Suffixes to be deleted. :type __perfective_gerund_suffixes: tuple :cvar __adjectival_suffixes: Suffixes to be deleted. :type __adjectival_suffixes: tuple :cvar __reflexive_suffixes: Suffixes to be deleted. :type __reflexive_suffixes: tuple :cvar __verb_suffixes: Suffixes to be deleted. :type __verb_suffixes: tuple :cvar __noun_suffixes: Suffixes to be deleted. :type __noun_suffixes: tuple :cvar __superlative_suffixes: Suffixes to be deleted. :type __superlative_suffixes: tuple :cvar __derivational_suffixes: Suffixes to be deleted. :type __derivational_suffixes: tuple :note: A detailed description of the Russian stemming algorithm can be found under http://snowball.tartarus.org/algorithms/russian/stemmer.html """ __perfective_gerund_suffixes = ("ivshis'", "yvshis'", "vshis'", "ivshi", "yvshi", "vshi", "iv", "yv", "v") __adjectival_suffixes = ('ui^ushchi^ui^u', 'ui^ushchi^ai^a', 'ui^ushchimi', 'ui^ushchymi', 'ui^ushchego', 'ui^ushchogo', 'ui^ushchemu', 'ui^ushchomu', 'ui^ushchikh', 'ui^ushchykh', 'ui^ushchui^u', 'ui^ushchaia', 'ui^ushchoi^u', 'ui^ushchei^u', 'i^ushchi^ui^u', 'i^ushchi^ai^a', 'ui^ushchee', 'ui^ushchie', 'ui^ushchye', 'ui^ushchoe', 'ui^ushchei`', 'ui^ushchii`', 'ui^ushchyi`', 'ui^ushchoi`', 'ui^ushchem', 'ui^ushchim', 'ui^ushchym', 'ui^ushchom', 'i^ushchimi', 'i^ushchymi', 'i^ushchego', 'i^ushchogo', 'i^ushchemu', 'i^ushchomu', 'i^ushchikh', 'i^ushchykh', 'i^ushchui^u', 'i^ushchai^a', 'i^ushchoi^u', 'i^ushchei^u', 'i^ushchee', 'i^ushchie', 'i^ushchye', 'i^ushchoe', 'i^ushchei`', 'i^ushchii`', 'i^ushchyi`', 'i^ushchoi`', 'i^ushchem', 'i^ushchim', 'i^ushchym', 'i^ushchom', 'shchi^ui^u', 'shchi^ai^a', 'ivshi^ui^u', 'ivshi^ai^a', 'yvshi^ui^u', 'yvshi^ai^a', 'shchimi', 'shchymi', 'shchego', 'shchogo', 'shchemu', 'shchomu', 'shchikh', 'shchykh', 'shchui^u', 'shchai^a', 'shchoi^u', 'shchei^u', 'ivshimi', 'ivshymi', 'ivshego', 'ivshogo', 'ivshemu', 'ivshomu', 'ivshikh', 'ivshykh', 'ivshui^u', 'ivshai^a', 'ivshoi^u', 'ivshei^u', 'yvshimi', 'yvshymi', 'yvshego', 'yvshogo', 'yvshemu', 'yvshomu', 'yvshikh', 'yvshykh', 'yvshui^u', 'yvshai^a', 'yvshoi^u', 'yvshei^u', 'vshi^ui^u', 'vshi^ai^a', 'shchee', 'shchie', 'shchye', 'shchoe', 'shchei`', 'shchii`', 'shchyi`', 'shchoi`', 'shchem', 'shchim', 'shchym', 'shchom', 'ivshee', 'ivshie', 'ivshye', 'ivshoe', 'ivshei`', 'ivshii`', 'ivshyi`', 'ivshoi`', 'ivshem', 'ivshim', 'ivshym', 'ivshom', 'yvshee', 'yvshie', 'yvshye', 'yvshoe', 'yvshei`', 'yvshii`', 'yvshyi`', 'yvshoi`', 'yvshem', 'yvshim', 'yvshym', 'yvshom', 'vshimi', 'vshymi', 'vshego', 'vshogo', 'vshemu', 'vshomu', 'vshikh', 'vshykh', 'vshui^u', 'vshai^a', 'vshoi^u', 'vshei^u', 'emi^ui^u', 'emi^ai^a', 'nni^ui^u', 'nni^ai^a', 'vshee', 'vshie', 'vshye', 'vshoe', 'vshei`', 'vshii`', 'vshyi`', 'vshoi`', 'vshem', 'vshim', 'vshym', 'vshom', 'emimi', 'emymi', 'emego', 'emogo', 'ememu', 'emomu', 'emikh', 'emykh', 'emui^u', 'emai^a', 'emoi^u', 'emei^u', 'nnimi', 'nnymi', 'nnego', 'nnogo', 'nnemu', 'nnomu', 'nnikh', 'nnykh', 'nnui^u', 'nnai^a', 'nnoi^u', 'nnei^u', 'emee', 'emie', 'emye', 'emoe', 'emei`', 'emii`', 'emyi`', 'emoi`', 'emem', 'emim', 'emym', 'emom', 'nnee', 'nnie', 'nnye', 'nnoe', 'nnei`', 'nnii`', 'nnyi`', 'nnoi`', 'nnem', 'nnim', 'nnym', 'nnom', 'i^ui^u', 'i^ai^a', 'imi', 'ymi', 'ego', 'ogo', 'emu', 'omu', 'ikh', 'ykh', 'ui^u', 'ai^a', 'oi^u', 'ei^u', 'ee', 'ie', 'ye', 'oe', 'ei`', 'ii`', 'yi`', 'oi`', 'em', 'im', 'ym', 'om') __reflexive_suffixes = ("si^a", "s'") __verb_suffixes = ("esh'", 'ei`te', 'ui`te', 'ui^ut', "ish'", 'ete', 'i`te', 'i^ut', 'nno', 'ila', 'yla', 'ena', 'ite', 'ili', 'yli', 'ilo', 'ylo', 'eno', 'i^at', 'uet', 'eny', "it'", "yt'", 'ui^u', 'la', 'na', 'li', 'em', 'lo', 'no', 'et', 'ny', "t'", 'ei`', 'ui`', 'il', 'yl', 'im', 'ym', 'en', 'it', 'yt', 'i^u', 'i`', 'l', 'n') __noun_suffixes = ('ii^ami', 'ii^akh', 'i^ami', 'ii^am', 'i^akh', 'ami', 'iei`', 'i^am', 'iem', 'akh', 'ii^u', "'i^u", 'ii^a', "'i^a", 'ev', 'ov', 'ie', "'e", 'ei', 'ii', 'ei`', 'oi`', 'ii`', 'em', 'am', 'om', 'i^u', 'i^a', 'a', 'e', 'i', 'i`', 'o', 'u', 'y', "'") __superlative_suffixes = ("ei`she", "ei`sh") __derivational_suffixes = ("ost'", "ost") def stem(self, word): """ Stem a Russian word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ chr_exceeded = False for i in range(len(word)): if ord(word[i]) > 255: chr_exceeded = True break if chr_exceeded: word = self.__cyrillic_to_roman(word) step1_success = False adjectival_removed = False verb_removed = False undouble_success = False superlative_removed = False rv, r2 = self.__regions_russian(word) # Step 1 for suffix in self.__perfective_gerund_suffixes: if rv.endswith(suffix): if suffix in ("v", "vshi", "vshis'"): if (rv[-len(suffix) - 3:-len(suffix)] == "i^a" or rv[-len(suffix) - 1:-len(suffix)] == "a"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] step1_success = True break else: word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] step1_success = True break if not step1_success: for suffix in self.__reflexive_suffixes: if rv.endswith(suffix): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] break for suffix in self.__adjectival_suffixes: if rv.endswith(suffix): if suffix in ('i^ushchi^ui^u', 'i^ushchi^ai^a', 'i^ushchui^u', 'i^ushchai^a', 'i^ushchoi^u', 'i^ushchei^u', 'i^ushchimi', 'i^ushchymi', 'i^ushchego', 'i^ushchogo', 'i^ushchemu', 'i^ushchomu', 'i^ushchikh', 'i^ushchykh', 'shchi^ui^u', 'shchi^ai^a', 'i^ushchee', 'i^ushchie', 'i^ushchye', 'i^ushchoe', 'i^ushchei`', 'i^ushchii`', 'i^ushchyi`', 'i^ushchoi`', 'i^ushchem', 'i^ushchim', 'i^ushchym', 'i^ushchom', 'vshi^ui^u', 'vshi^ai^a', 'shchui^u', 'shchai^a', 'shchoi^u', 'shchei^u', 'emi^ui^u', 'emi^ai^a', 'nni^ui^u', 'nni^ai^a', 'shchimi', 'shchymi', 'shchego', 'shchogo', 'shchemu', 'shchomu', 'shchikh', 'shchykh', 'vshui^u', 'vshai^a', 'vshoi^u', 'vshei^u', 'shchee', 'shchie', 'shchye', 'shchoe', 'shchei`', 'shchii`', 'shchyi`', 'shchoi`', 'shchem', 'shchim', 'shchym', 'shchom', 'vshimi', 'vshymi', 'vshego', 'vshogo', 'vshemu', 'vshomu', 'vshikh', 'vshykh', 'emui^u', 'emai^a', 'emoi^u', 'emei^u', 'nnui^u', 'nnai^a', 'nnoi^u', 'nnei^u', 'vshee', 'vshie', 'vshye', 'vshoe', 'vshei`', 'vshii`', 'vshyi`', 'vshoi`', 'vshem', 'vshim', 'vshym', 'vshom', 'emimi', 'emymi', 'emego', 'emogo', 'ememu', 'emomu', 'emikh', 'emykh', 'nnimi', 'nnymi', 'nnego', 'nnogo', 'nnemu', 'nnomu', 'nnikh', 'nnykh', 'emee', 'emie', 'emye', 'emoe', 'emei`', 'emii`', 'emyi`', 'emoi`', 'emem', 'emim', 'emym', 'emom', 'nnee', 'nnie', 'nnye', 'nnoe', 'nnei`', 'nnii`', 'nnyi`', 'nnoi`', 'nnem', 'nnim', 'nnym', 'nnom'): if (rv[-len(suffix) - 3:-len(suffix)] == "i^a" or rv[-len(suffix) - 1:-len(suffix)] == "a"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] adjectival_removed = True break else: word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] adjectival_removed = True break if not adjectival_removed: for suffix in self.__verb_suffixes: if rv.endswith(suffix): if suffix in ("la", "na", "ete", "i`te", "li", "i`", "l", "em", "n", "lo", "no", "et", "i^ut", "ny", "t'", "esh'", "nno"): if (rv[-len(suffix) - 3:-len(suffix)] == "i^a" or rv[-len(suffix) - 1:-len(suffix)] == "a"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] verb_removed = True break else: word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] verb_removed = True break if not adjectival_removed and not verb_removed: for suffix in self.__noun_suffixes: if rv.endswith(suffix): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] break # Step 2 if rv.endswith("i"): word = word[:-1] r2 = r2[:-1] # Step 3 for suffix in self.__derivational_suffixes: if r2.endswith(suffix): word = word[:-len(suffix)] break # Step 4 if word.endswith("nn"): word = word[:-1] undouble_success = True if not undouble_success: for suffix in self.__superlative_suffixes: if word.endswith(suffix): word = word[:-len(suffix)] superlative_removed = True break if word.endswith("nn"): word = word[:-1] if not undouble_success and not superlative_removed: if word.endswith("'"): word = word[:-1] if chr_exceeded: word = self.__roman_to_cyrillic(word) return word def __regions_russian(self, word): """ Return the regions RV and R2 which are used by the Russian stemmer. In any word, RV is the region after the first vowel, or the end of the word if it contains no vowel. R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel. R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel. :param word: The Russian word whose regions RV and R2 are determined. :type word: str or unicode :return: the regions RV and R2 for the respective Russian word. :rtype: tuple :note: This helper method is invoked by the stem method of the subclass RussianStemmer. It is not to be invoked directly! """ r1 = "" r2 = "" rv = "" vowels = ("A", "U", "E", "a", "e", "i", "o", "u", "y") word = (word.replace("i^a", "A") .replace("i^u", "U") .replace("e`", "E")) for i in range(1, len(word)): if word[i] not in vowels and word[i - 1] in vowels: r1 = word[i + 1:] break for i in range(1, len(r1)): if r1[i] not in vowels and r1[i - 1] in vowels: r2 = r1[i + 1:] break for i in range(len(word)): if word[i] in vowels: rv = word[i + 1:] break r2 = (r2.replace("A", "i^a") .replace("U", "i^u") .replace("E", "e`")) rv = (rv.replace("A", "i^a") .replace("U", "i^u") .replace("E", "e`")) return (rv, r2) def __cyrillic_to_roman(self, word): """ Transliterate a Russian word into the Roman alphabet. A Russian word whose letters consist of the Cyrillic alphabet are transliterated into the Roman alphabet in order to ease the forthcoming stemming process. :param word: The word that is transliterated. :type word: unicode :return: the transliterated word. :rtype: unicode :note: This helper method is invoked by the stem method of the subclass RussianStemmer. It is not to be invoked directly! """ word = (word.replace(u("\u0410"), "a").replace(u("\u0430"), "a") .replace(u("\u0411"), "b").replace(u("\u0431"), "b") .replace(u("\u0412"), "v").replace(u("\u0432"), "v") .replace(u("\u0413"), "g").replace(u("\u0433"), "g") .replace(u("\u0414"), "d").replace(u("\u0434"), "d") .replace(u("\u0415"), "e").replace(u("\u0435"), "e") .replace(u("\u0401"), "e").replace(u("\u0451"), "e") .replace(u("\u0416"), "zh").replace(u("\u0436"), "zh") .replace(u("\u0417"), "z").replace(u("\u0437"), "z") .replace(u("\u0418"), "i").replace(u("\u0438"), "i") .replace(u("\u0419"), "i`").replace(u("\u0439"), "i`") .replace(u("\u041A"), "k").replace(u("\u043A"), "k") .replace(u("\u041B"), "l").replace(u("\u043B"), "l") .replace(u("\u041C"), "m").replace(u("\u043C"), "m") .replace(u("\u041D"), "n").replace(u("\u043D"), "n") .replace(u("\u041E"), "o").replace(u("\u043E"), "o") .replace(u("\u041F"), "p").replace(u("\u043F"), "p") .replace(u("\u0420"), "r").replace(u("\u0440"), "r") .replace(u("\u0421"), "s").replace(u("\u0441"), "s") .replace(u("\u0422"), "t").replace(u("\u0442"), "t") .replace(u("\u0423"), "u").replace(u("\u0443"), "u") .replace(u("\u0424"), "f").replace(u("\u0444"), "f") .replace(u("\u0425"), "kh").replace(u("\u0445"), "kh") .replace(u("\u0426"), "t^s").replace(u("\u0446"), "t^s") .replace(u("\u0427"), "ch").replace(u("\u0447"), "ch") .replace(u("\u0428"), "sh").replace(u("\u0448"), "sh") .replace(u("\u0429"), "shch").replace(u("\u0449"), "shch") .replace(u("\u042A"), "''").replace(u("\u044A"), "''") .replace(u("\u042B"), "y").replace(u("\u044B"), "y") .replace(u("\u042C"), "'").replace(u("\u044C"), "'") .replace(u("\u042D"), "e`").replace(u("\u044D"), "e`") .replace(u("\u042E"), "i^u").replace(u("\u044E"), "i^u") .replace(u("\u042F"), "i^a").replace(u("\u044F"), "i^a")) return word def __roman_to_cyrillic(self, word): """ Transliterate a Russian word back into the Cyrillic alphabet. A Russian word formerly transliterated into the Roman alphabet in order to ease the stemming process, is transliterated back into the Cyrillic alphabet, its original form. :param word: The word that is transliterated. :type word: str or unicode :return: word, the transliterated word. :rtype: unicode :note: This helper method is invoked by the stem method of the subclass RussianStemmer. It is not to be invoked directly! """ word = (word.replace("i^u", u("\u044E")).replace("i^a", u("\u044F")) .replace("shch", u("\u0449")).replace("kh", u("\u0445")) .replace("t^s", u("\u0446")).replace("ch", u("\u0447")) .replace("e`", u("\u044D")).replace("i`", u("\u0439")) .replace("sh", u("\u0448")).replace("k", u("\u043A")) .replace("e", u("\u0435")).replace("zh", u("\u0436")) .replace("a", u("\u0430")).replace("b", u("\u0431")) .replace("v", u("\u0432")).replace("g", u("\u0433")) .replace("d", u("\u0434")).replace("e", u("\u0435")) .replace("z", u("\u0437")).replace("i", u("\u0438")) .replace("l", u("\u043B")).replace("m", u("\u043C")) .replace("n", u("\u043D")).replace("o", u("\u043E")) .replace("p", u("\u043F")).replace("r", u("\u0440")) .replace("s", u("\u0441")).replace("t", u("\u0442")) .replace("u", u("\u0443")).replace("f", u("\u0444")) .replace("''", u("\u044A")).replace("y", u("\u044B")) .replace("'", u("\u044C"))) return word