EVOLUTION-MANAGER
Edit File: __init__.py
# coding=utf-8 # Copyright 2012 Matt Chaput. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # # 1. Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # # THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR # IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO # EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, # OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, # EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # The views and conclusions contained in the software and documentation are # those of the authors and should not be interpreted as representing official # policies, either expressed or implied, of Matt Chaput. 
# Exceptions

class NoStemmer(Exception):
    """Raised by ``stemmer_for_language`` when no stemmer is available."""


class NoStopWords(Exception):
    """Raised by ``stopwords_for_language`` when no stop list is available."""


# Data and functions for language names

# Canonical two-letter language codes recognized by this module.
languages = ("ar", "da", "nl", "en", "fi", "fr", "de", "hu", "it", "no", "pt",
             "ro", "ru", "es", "sv", "tr")

# Alternative (lowercase) spellings mapped to the canonical two-letter code.
aliases = {
    # By ISO 639-2/T (ISO 639-3) three-letter codes
    "ara": "ar",
    "dan": "da",
    "nld": "nl",
    "eng": "en",
    "fin": "fi",
    "fra": "fr",
    "deu": "de",
    "hun": "hu",
    "ita": "it",
    "nor": "no",
    "por": "pt",
    "ron": "ro",
    "rus": "ru",
    "spa": "es",
    "swe": "sv",
    "tur": "tr",

    # By name in English
    "arabic": "ar",
    "danish": "da",
    "dutch": "nl",
    "english": "en",
    "finnish": "fi",
    "french": "fr",
    "german": "de",
    "hungarian": "hu",
    "italian": "it",
    "norwegian": "no",
    "portuguese": "pt",
    "romanian": "ro",
    "russian": "ru",
    "spanish": "es",
    "swedish": "sv",
    "turkish": "tr",

    # By name in own language
    "العربية": "ar",
    "dansk": "da",
    "nederlands": "nl",
    "suomi": "fi",
    "français": "fr",
    "deutsch": "de",
    "magyar": "hu",
    "italiano": "it",
    "norsk": "no",
    "português": "pt",
    "română": "ro",
    "русский язык": "ru",
    "español": "es",
    "svenska": "sv",
    "türkçe": "tr",
}


def two_letter_code(name):
    """Return the canonical two-letter code for a language name.

    ``name`` may be one of the codes in ``languages`` or any key of
    ``aliases``. An exact match is tried first; if that fails and ``name``
    is string-like, the lookup is retried with surrounding whitespace
    removed and the text lower-cased, so that e.g. ``"English"`` or
    ``" DE "`` also resolve.

    :param name: a language code or language name.
    :returns: the two-letter code, or ``None`` if the name is not recognized.
    """

    # Exact match first, so every name that resolved before still resolves
    # to the same code.
    if name in languages:
        return name
    code = aliases.get(name)
    if code is not None:
        return code

    # Lenient fallback for string-like input only (None, numbers, etc. fall
    # through to the final ``return None``).
    if hasattr(name, "strip") and hasattr(name, "lower"):
        normalized = name.strip().lower()
        if normalized in languages:
            return normalized
        return aliases.get(normalized)
    return None


# Getter functions

def has_stemmer(lang):
    """Return True if ``stemmer_for_language(lang)`` yields a stemmer.

    :param lang: a language name as accepted by ``stemmer_for_language``.
    :returns: ``False`` if ``NoStemmer`` was raised, otherwise the truth
        value of the returned stemmer.
    """

    try:
        return bool(stemmer_for_language(lang))
    except NoStemmer:
        return False


def has_stopwords(lang):
    """Return True if ``stopwords_for_language(lang)`` yields a stop list.

    :param lang: a language name as accepted by ``stopwords_for_language``.
    :returns: ``False`` if ``NoStopWords`` was raised, otherwise the truth
        value of the returned stop list.
    """

    try:
        return bool(stopwords_for_language(lang))
    except NoStopWords:
        return False


def stemmer_for_language(lang):
    """Return a stemming function for the given language.

    The special name ``"en_porter"`` selects the ``stem`` function from the
    ``.porter`` module. Any other name is resolved with ``two_letter_code``:
    Arabic uses ``ISRIStemmer`` from ``.isri``; other languages use the
    class registered for that code in ``.snowball.classes``.

    :param lang: ``"en_porter"``, or a name accepted by ``two_letter_code``.
    :returns: the stemming callable (a function or a bound ``stem`` method).
    :raises NoStemmer: if no stemmer is registered for the language.
    """

    if lang == "en_porter":
        # Original porter stemming algorithm is several times faster than the
        # more correct porter2 algorithm in snowball package
        from .porter import stem as porter_stem
        return porter_stem

    tlc = two_letter_code(lang)

    if tlc == "ar":
        from .isri import ISRIStemmer
        return ISRIStemmer().stem

    # Imported lazily so the stemmer modules are only loaded when needed.
    from .snowball import classes as snowball_classes
    if tlc in snowball_classes:
        return snowball_classes[tlc]().stem

    raise NoStemmer("No stemmer available for %r" % lang)


def stopwords_for_language(lang):
    """Return the stop-word list for the given language.

    The name is resolved with ``two_letter_code`` and looked up in
    ``.stopwords.stoplists``.

    :param lang: a name accepted by ``two_letter_code``.
    :returns: the ``stoplists`` entry for the resolved two-letter code.
    :raises NoStopWords: if there is no entry for the language.
    """

    # Imported lazily so the stop-word data is only loaded when needed.
    from .stopwords import stoplists

    tlc = two_letter_code(lang)
    if tlc in stoplists:
        return stoplists[tlc]

    raise NoStopWords("No stop-word list available for %r" % lang)