EVOLUTION-MANAGER
Edit File: hungarian.py
try:
    from whoosh.compat import u
except ImportError:
    # Fallback so the stemmer stays importable (and testable) on its own
    # when the surrounding whoosh package is not available.
    def u(s):
        """Return *s* as text, decoding escapes when given a byte string."""
        if isinstance(s, bytes):
            return s.decode("unicode_escape")
        return s


class HungarianStemmer(object):
    """
    The Hungarian Snowball stemmer.

    :cvar __vowels: The Hungarian vowels.
    :type __vowels: unicode
    :cvar __digraphs: The Hungarian digraphs.
    :type __digraphs: tuple
    :cvar __double_consonants: The Hungarian double consonants.
    :type __double_consonants: tuple
    :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm.
    :type __step1_suffixes: tuple
    :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm.
    :type __step2_suffixes: tuple
    :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm.
    :type __step3_suffixes: tuple
    :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm.
    :type __step4_suffixes: tuple
    :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm.
    :type __step5_suffixes: tuple
    :cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm.
    :type __step6_suffixes: tuple
    :cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm.
    :type __step7_suffixes: tuple
    :cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm.
    :type __step8_suffixes: tuple
    :cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm.
    :type __step9_suffixes: tuple
    :note: A detailed description of the Hungarian stemming algorithm can
           be found under
           http://snowball.tartarus.org/algorithms/hungarian/stemmer.html
    """

    # NOTE(review): the tables below spell the long umlaut vowels as
    # \xF5 / \xFB; input that uses U+0151 / U+0171 instead is not matched
    # by them -- confirm whether callers normalise their text first.
    __vowels = u("aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB")
    __digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs")
    __double_consonants = ("bb", "cc", "ccs", "dd", "ff", "gg",
                           "ggy", "jj", "kk", "ll", "lly", "mm",
                           "nn", "nny", "pp", "rr", "ss", "ssz",
                           "tt", "tty", "vv", "zz", "zzs")

    # The order of every suffix tuple matters: each step stops at the
    # first suffix that matches, so longer suffixes precede shorter ones.
    __step1_suffixes = ("al", "el")
    __step2_suffixes = (u('k\xE9ppen'), u('onk\xE9nt'), u('enk\xE9nt'),
                        u('ank\xE9nt'), u('k\xE9pp'), u('k\xE9nt'), 'ban',
                        'ben', 'nak', 'nek', 'val', 'vel', u('t\xF3l'),
                        u('t\xF5l'), u('r\xF3l'), u('r\xF5l'), u('b\xF3l'),
                        u('b\xF5l'), 'hoz', 'hez', u('h\xF6z'),
                        u('n\xE1l'), u('n\xE9l'), u('\xE9rt'), 'kor',
                        'ba', 'be', 'ra', 're', 'ig', 'at', 'et',
                        'ot', u('\xF6t'), 'ul', u('\xFCl'), u('v\xE1'),
                        u('v\xE9'), 'en', 'on', 'an', u('\xF6n'),
                        'n', 't')
    __step3_suffixes = (u("\xE1nk\xE9nt"), u("\xE1n"), u("\xE9n"))
    __step4_suffixes = ('astul', u('est\xFCl'), u('\xE1stul'),
                        u('\xE9st\xFCl'), 'stul', u('st\xFCl'))
    __step5_suffixes = (u("\xE1"), u("\xE9"))
    __step6_suffixes = (u('ok\xE9'), u('\xF6k\xE9'), u('ak\xE9'),
                        u('ek\xE9'), u('\xE1k\xE9'), u('\xE1\xE9i'),
                        u('\xE9k\xE9'), u('\xE9\xE9i'), u('k\xE9'),
                        u('\xE9i'), u('\xE9\xE9'), u('\xE9'))
    __step7_suffixes = (u('\xE1juk'), u('\xE9j\xFCk'), u('\xFCnk'),
                        'unk', 'juk', u('j\xFCk'), u('\xE1nk'),
                        u('\xE9nk'), 'nk', 'uk', u('\xFCk'), 'em',
                        'om', 'am', 'od', 'ed', 'ad', u('\xF6d'),
                        'ja', 'je', u('\xE1m'), u('\xE1d'), u('\xE9m'),
                        u('\xE9d'), 'm', 'd', 'a', 'e', 'o',
                        u('\xE1'), u('\xE9'))
    __step8_suffixes = ('jaitok', 'jeitek', 'jaink', 'jeink', 'aitok',
                        'eitek', u('\xE1itok'), u('\xE9itek'), 'jaim',
                        'jeim', 'jaid', 'jeid', 'eink', 'aink',
                        'itek', 'jeik', 'jaik', u('\xE1ink'),
                        u('\xE9ink'), 'aim', 'eim', 'aid', 'eid',
                        'jai', 'jei', 'ink', 'aik', 'eik',
                        u('\xE1im'), u('\xE1id'), u('\xE1ik'), u('\xE9im'),
                        u('\xE9id'), u('\xE9ik'), 'im', 'id', 'ai',
                        'ei', 'ik', u('\xE1i'), u('\xE9i'), 'i')
    __step9_suffixes = (u("\xE1k"), u("\xE9k"), u("\xF6k"), "ok",
                        "ek", "ak", "k")

    def stem(self, word):
        """
        Stem an Hungarian word and return the stemmed form.

        The word is lower-cased, its region R1 is determined, and nine
        suffix-stripping steps are applied in order.  A suffix is only
        removed when it lies inside R1.  An empty word is returned
        unchanged.

        :param word: The word that is stemmed.
        :type word: str or unicode
        :return: The stemmed form.
        :rtype: unicode
        """
        word = word.lower()

        r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs)

        # STEP 1: Remove instrumental case
        # Only acts when the suffix follows a double consonant; the
        # suffix is dropped and the consonant is undoubled.
        if r1.endswith(self.__step1_suffixes):
            for double_cons in self.__double_consonants:
                if word[-2 - len(double_cons):-2] == double_cons:
                    word = word[:-4] + word[-3]

                    if r1[-2 - len(double_cons):-2] == double_cons:
                        r1 = r1[:-4] + r1[-3]
                    break

        # STEP 2: Remove frequent cases
        # The search stops at the first suffix the word ends with, even
        # when that suffix is outside R1 and therefore not removed.
        for suffix in self.__step2_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]

                    # A long final vowel exposed by the removal is
                    # shortened.
                    if r1.endswith(u("\xE1")):
                        word = word[:-1] + "a"
                        r1 = r1[:-1] + "a"

                    elif r1.endswith(u("\xE9")):
                        word = word[:-1] + "e"
                        r1 = r1[:-1] + "e"
                break

        # STEP 3: Remove special cases
        for suffix in self.__step3_suffixes:
            if r1.endswith(suffix):
                if suffix == u("\xE9n"):
                    word = word[:-2] + "e"
                    r1 = r1[:-2] + "e"
                else:
                    word = word[:-len(suffix)] + "a"
                    r1 = r1[:-len(suffix)] + "a"
                break

        # STEP 4: Remove other cases
        for suffix in self.__step4_suffixes:
            if r1.endswith(suffix):
                if suffix == u("\xE1stul"):
                    word = word[:-5] + "a"
                    r1 = r1[:-5] + "a"

                elif suffix == u("\xE9st\xFCl"):
                    word = word[:-5] + "e"
                    r1 = r1[:-5] + "e"
                else:
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]
                break

        # STEP 5: Remove factive case
        # Like step 1: requires a preceding double consonant, which is
        # undoubled when the one-letter suffix is dropped.
        for suffix in self.__step5_suffixes:
            if r1.endswith(suffix):
                for double_cons in self.__double_consonants:
                    if word[-1 - len(double_cons):-1] == double_cons:
                        word = word[:-3] + word[-2]

                        if r1[-1 - len(double_cons):-1] == double_cons:
                            r1 = r1[:-3] + r1[-2]
                        break

        # STEP 6: Remove owned
        for suffix in self.__step6_suffixes:
            if r1.endswith(suffix):
                if suffix in (u("\xE1k\xE9"), u("\xE1\xE9i")):
                    word = word[:-3] + "a"
                    r1 = r1[:-3] + "a"

                elif suffix in (u("\xE9k\xE9"), u("\xE9\xE9i"),
                                u("\xE9\xE9")):
                    word = word[:-len(suffix)] + "e"
                    r1 = r1[:-len(suffix)] + "e"
                else:
                    word = word[:-len(suffix)]
                    r1 = r1[:-len(suffix)]
                break

        # STEP 7: Remove singular owner suffixes
        for suffix in self.__step7_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix in (u("\xE1nk"), u("\xE1juk"), u("\xE1m"),
                                  u("\xE1d"), u("\xE1")):
                        word = word[:-len(suffix)] + "a"
                        r1 = r1[:-len(suffix)] + "a"

                    elif suffix in (u("\xE9nk"), u("\xE9j\xFCk"),
                                    u("\xE9m"), u("\xE9d"), u("\xE9")):
                        word = word[:-len(suffix)] + "e"
                        r1 = r1[:-len(suffix)] + "e"
                    else:
                        word = word[:-len(suffix)]
                        r1 = r1[:-len(suffix)]
                break

        # STEP 8: Remove plural owner suffixes
        for suffix in self.__step8_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix in (u("\xE1im"), u("\xE1id"), u("\xE1i"),
                                  u("\xE1ink"), u("\xE1itok"), u("\xE1ik")):
                        word = word[:-len(suffix)] + "a"
                        r1 = r1[:-len(suffix)] + "a"

                    elif suffix in (u("\xE9im"), u("\xE9id"), u("\xE9i"),
                                    u("\xE9ink"), u("\xE9itek"),
                                    u("\xE9ik")):
                        word = word[:-len(suffix)] + "e"
                        r1 = r1[:-len(suffix)] + "e"
                    else:
                        word = word[:-len(suffix)]
                        r1 = r1[:-len(suffix)]
                break

        # STEP 9: Remove plural suffixes
        # Last step, so R1 no longer needs to be kept up to date.
        for suffix in self.__step9_suffixes:
            if word.endswith(suffix):
                if r1.endswith(suffix):
                    if suffix == u("\xE1k"):
                        word = word[:-2] + "a"
                    elif suffix == u("\xE9k"):
                        word = word[:-2] + "e"
                    else:
                        word = word[:-len(suffix)]
                break

        return word

    def __r1_hungarian(self, word, vowels, digraphs):
        """
        Return the region R1 that is used by the Hungarian stemmer.

        If the word begins with a vowel, R1 is defined as the region
        after the first consonant or digraph (= two letters stand for
        one phoneme) in the word. If the word begins with a consonant,
        it is defined as the region after the first vowel in the word.
        If the word does not contain both a vowel and consonant, R1
        is the null region at the end of the word. An empty word has an
        empty R1.

        :param word: The Hungarian word whose region R1 is determined.
        :type word: str or unicode
        :param vowels: The Hungarian vowels that are used to determine
                       the region R1.
        :type vowels: unicode
        :param digraphs: The digraphs that are used to determine the
                         region R1.
        :type digraphs: tuple
        :return: the region R1 for the respective word.
        :rtype: unicode
        :note: This helper method is invoked by the stem method of
               HungarianStemmer. It is not to be invoked directly!
        """
        if not word:
            return ""

        if word[0] in vowels:
            for i in range(1, len(word)):
                if word[i] not in vowels:
                    # Skip the whole digraph when one starts exactly at
                    # the first consonant; prefer the longest candidate
                    # so that e.g. "dzs" wins over "dz".
                    width = 1
                    for digraph in digraphs:
                        if (len(digraph) > width
                                and word.startswith(digraph, i)):
                            width = len(digraph)
                    return word[i + width:]
            return ""

        for i in range(1, len(word)):
            if word[i] in vowels:
                return word[i + 1:]
        return ""