EVOLUTION-MANAGER
Edit File: french.py
from .bases import _StandardStemmer from whoosh.compat import u class FrenchStemmer(_StandardStemmer): """ The French Snowball stemmer. :cvar __vowels: The French vowels. :type __vowels: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm. :type __step2a_suffixes: tuple :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm. :type __step2b_suffixes: tuple :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. :type __step4_suffixes: tuple :note: A detailed description of the French stemming algorithm can be found under http://snowball.tartarus.org/algorithms/french/stemmer.html """ __vowels = u("aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9") __step1_suffixes = ('issements', 'issement', 'atrices', 'atrice', 'ateurs', 'ations', 'logies', 'usions', 'utions', 'ements', 'amment', 'emment', 'ances', 'iqUes', 'ismes', 'ables', 'istes', 'ateur', 'ation', 'logie', 'usion', 'ution', 'ences', 'ement', 'euses', 'ments', 'ance', 'iqUe', 'isme', 'able', 'iste', 'ence', u('it\xE9s'), 'ives', 'eaux', 'euse', 'ment', 'eux', u('it\xE9'), 'ive', 'ifs', 'aux', 'if') __step2a_suffixes = ('issaIent', 'issantes', 'iraIent', 'issante', 'issants', 'issions', 'irions', 'issais', 'issait', 'issant', 'issent', 'issiez', 'issons', 'irais', 'irait', 'irent', 'iriez', 'irons', 'iront', 'isses', 'issez', u('\xEEmes'), u('\xEEtes'), 'irai', 'iras', 'irez', 'isse', 'ies', 'ira', u('\xEEt'), 'ie', 'ir', 'is', 'it', 'i') __step2b_suffixes = ('eraIent', 'assions', 'erions', 'assent', 'assiez', u('\xE8rent'), 'erais', 'erait', 'eriez', 'erons', 'eront', 'aIent', 'antes', 'asses', 'ions', 'erai', 'eras', 'erez', u('\xE2mes'), u('\xE2tes'), 'ante', 'ants', 'asse', u('\xE9es'), 'era', 'iez', 'ais', 'ait', 'ant', u('\xE9e'), u('\xE9s'), 'er', 'ez', u('\xE2t'), 'ai', 'as', u('\xE9'), 'a') __step4_suffixes = (u('i\xE8re'), u('I\xE8re'), 'ion', 'ier', 'Ier', 'e', u('\xEB')) def stem(self, word): """ Stem a French word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() step1_success = False rv_ending_found = False step2a_success = False step2b_success = False # Every occurrence of 'u' after 'q' is put into upper case. for i in range(1, len(word)): if word[i - 1] == "q" and word[i] == "u": word = "".join((word[:i], "U", word[i + 1:])) # Every occurrence of 'u' and 'i' # between vowels is put into upper case. # Every occurrence of 'y' preceded or # followed by a vowel is also put into upper case. for i in range(1, len(word) - 1): if word[i - 1] in self.__vowels and word[i + 1] in self.__vowels: if word[i] == "u": word = "".join((word[:i], "U", word[i + 1:])) elif word[i] == "i": word = "".join((word[:i], "I", word[i + 1:])) if word[i - 1] in self.__vowels or word[i + 1] in self.__vowels: if word[i] == "y": word = "".join((word[:i], "Y", word[i + 1:])) r1, r2 = self._r1r2_standard(word, self.__vowels) rv = self.__rv_french(word, self.__vowels) # STEP 1: Standard suffix removal for suffix in self.__step1_suffixes: if word.endswith(suffix): if suffix == "eaux": word = word[:-1] step1_success = True elif suffix in ("euse", "euses"): if suffix in r2: word = word[:-len(suffix)] step1_success = True elif suffix in r1: word = "".join((word[:-len(suffix)], "eux")) step1_success = True elif suffix in ("ement", "ements") and suffix in rv: word = word[:-len(suffix)] step1_success = True if word[-2:] == "iv" and "iv" in r2: word = word[:-2] if word[-2:] == "at" and "at" in r2: word = word[:-2] elif word[-3:] == "eus": if "eus" in r2: word = word[:-3] elif "eus" in r1: word = "".join((word[:-1], "x")) elif word[-3:] in ("abl", "iqU"): if "abl" in r2 or "iqU" in r2: word = word[:-3] elif word[-3:] in (u("i\xE8r"), u("I\xE8r")): if u("i\xE8r") in rv or u("I\xE8r") in rv: word = "".join((word[:-3], "i")) elif suffix == "amment" and suffix in rv: word = "".join((word[:-6], "ant")) rv = "".join((rv[:-6], "ant")) rv_ending_found = True elif suffix == "emment" and suffix in rv: word = "".join((word[:-6], "ent")) rv_ending_found = True elif (suffix in ("ment", "ments") and suffix in rv and not rv.startswith(suffix) and rv[rv.rindex(suffix) - 1] in self.__vowels): word = word[:-len(suffix)] rv = rv[:-len(suffix)] rv_ending_found = True elif suffix == "aux" and suffix in r1: word = "".join((word[:-2], "l")) step1_success = True elif (suffix in ("issement", "issements") and suffix in r1 and word[-len(suffix) - 1] not in self.__vowels): word = word[:-len(suffix)] step1_success = True elif suffix in ("ance", "iqUe", "isme", "able", "iste", "eux", "ances", "iqUes", "ismes", "ables", "istes") and suffix in r2: word = word[:-len(suffix)] step1_success = True elif suffix in ("atrice", "ateur", "ation", "atrices", "ateurs", "ations") and suffix in r2: word = word[:-len(suffix)] step1_success = True if word[-2:] == "ic": if "ic" in r2: word = word[:-2] else: word = "".join((word[:-2], "iqU")) elif suffix in ("logie", "logies") and suffix in r2: word = "".join((word[:-len(suffix)], "log")) step1_success = True elif (suffix in ("usion", "ution", "usions", "utions") and suffix in r2): word = "".join((word[:-len(suffix)], "u")) step1_success = True elif suffix in ("ence", "ences") and suffix in r2: word = "".join((word[:-len(suffix)], "ent")) step1_success = True elif suffix in (u("it\xE9"), u("it\xE9s")) and suffix in r2: word = word[:-len(suffix)] step1_success = True if word[-4:] == "abil": if "abil" in r2: word = word[:-4] else: word = "".join((word[:-2], "l")) elif word[-2:] == "ic": if "ic" in r2: word = word[:-2] else: word = "".join((word[:-2], "iqU")) elif word[-2:] == "iv": if "iv" in r2: word = word[:-2] elif (suffix in ("if", "ive", "ifs", "ives") and suffix in r2): word = word[:-len(suffix)] step1_success = True if word[-2:] == "at" and "at" in r2: word = word[:-2] if word[-2:] == "ic": if "ic" in r2: word = word[:-2] else: word = "".join((word[:-2], "iqU")) break # STEP 2a: Verb suffixes beginning 'i' if not step1_success or rv_ending_found: for suffix in self.__step2a_suffixes: if word.endswith(suffix): if (suffix in rv and len(rv) > len(suffix) and rv[rv.rindex(suffix) - 1] not in self.__vowels): word = word[:-len(suffix)] step2a_success = True break # STEP 2b: Other verb suffixes if not step2a_success: for suffix in self.__step2b_suffixes: if rv.endswith(suffix): if suffix == "ions" and "ions" in r2: word = word[:-4] step2b_success = True elif suffix in ('eraIent', 'erions', u('\xE8rent'), 'erais', 'erait', 'eriez', 'erons', 'eront', 'erai', 'eras', 'erez', u('\xE9es'), 'era', 'iez', u('\xE9e'), u('\xE9s'), 'er', 'ez', u('\xE9')): word = word[:-len(suffix)] step2b_success = True elif suffix in ('assions', 'assent', 'assiez', 'aIent', 'antes', 'asses', u('\xE2mes'), u('\xE2tes'), 'ante', 'ants', 'asse', 'ais', 'ait', 'ant', u('\xE2t'), 'ai', 'as', 'a'): word = word[:-len(suffix)] rv = rv[:-len(suffix)] step2b_success = True if rv.endswith("e"): word = word[:-1] break # STEP 3 if step1_success or step2a_success or step2b_success: if word[-1] == "Y": word = "".join((word[:-1], "i")) elif word[-1] == u("\xE7"): word = "".join((word[:-1], "c")) # STEP 4: Residual suffixes else: if (len(word) >= 2 and word[-1] == "s" and word[-2] not in u("aiou\xE8s")): word = word[:-1] for suffix in self.__step4_suffixes: if word.endswith(suffix): if suffix in rv: if (suffix == "ion" and suffix in r2 and rv[-4] in "st"): word = word[:-3] elif suffix in ("ier", u("i\xE8re"), "Ier", u("I\xE8re")): word = "".join((word[:-len(suffix)], "i")) elif suffix == "e": word = word[:-1] elif suffix == u("\xEB") and word[-3:-1] == "gu": word = word[:-1] break # STEP 5: Undouble if word.endswith(("enn", "onn", "ett", "ell", "eill")): word = word[:-1] # STEP 6: Un-accent for i in range(1, len(word)): if word[-i] not in self.__vowels: i += 1 else: if i != 1 and word[-i] in (u("\xE9"), u("\xE8")): word = "".join((word[:-i], "e", word[-i + 1:])) break word = (word.replace("I", "i") .replace("U", "u") .replace("Y", "y")) return word def __rv_french(self, word, vowels): """ Return the region RV that is used by the French stemmer. If the word begins with two vowels, RV is the region after the third letter. Otherwise, it is the region after the first vowel not at the beginning of the word, or the end of the word if these positions cannot be found. (Exceptionally, u'par', u'col' or u'tap' at the beginning of a word is also taken to define RV as the region to their right.) :param word: The French word whose region RV is determined. :type word: str or unicode :param vowels: The French vowels that are used to determine the region RV. :type vowels: unicode :return: the region RV for the respective French word. :rtype: unicode :note: This helper method is invoked by the stem method of the subclass FrenchStemmer. It is not to be invoked directly! """ rv = "" if len(word) >= 2: if (word.startswith(("par", "col", "tap")) or (word[0] in vowels and word[1] in vowels)): rv = word[3:] else: for i in range(1, len(word)): if word[i] in vowels: rv = word[i + 1:] break return rv