EVOLUTION-MANAGER
Edit File: termgenerator.h
/** @file termgenerator.h * @brief parse free text and generate terms */ /* Copyright (C) 2007,2009,2011,2012 Olly Betts * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef XAPIAN_INCLUDED_TERMGENERATOR_H #define XAPIAN_INCLUDED_TERMGENERATOR_H #include <xapian/base.h> #include <xapian/types.h> #include <xapian/unicode.h> #include <xapian/visibility.h> #include <string> namespace Xapian { class Document; class Stem; class Stopper; class WritableDatabase; /** Parses a piece of text and generate terms. * * This module takes a piece of text and parses it to produce words which are * then used to generate suitable terms for indexing. The terms generated are * suitable for use with Query objects produced by the QueryParser class. */ class XAPIAN_VISIBILITY_DEFAULT TermGenerator { public: /// @private @internal Class representing the TermGenerator internals. class Internal; /// @private @internal Reference counted internals. Xapian::Internal::RefCntPtr<Internal> internal; /// Copy constructor. TermGenerator(const TermGenerator & o); /// Assignment. TermGenerator & operator=(const TermGenerator & o); /// Default constructor. TermGenerator(); /// Destructor. ~TermGenerator(); /// Set the Xapian::Stem object to be used for generating stemmed terms. void set_stemmer(const Xapian::Stem & stemmer); /** Set the Xapian::Stopper object to be used for identifying stopwords. * * Stemmed forms of stopwords aren't indexed, but unstemmed forms still * are so that searches for phrases including stop words still work. * * @param stop The Stopper object to set (default NULL, which means no * stopwords). */ void set_stopper(const Xapian::Stopper *stop = NULL); /// Set the current document. void set_document(const Xapian::Document & doc); /// Get the current document. const Xapian::Document & get_document() const; /// Set the database to index spelling data to. void set_database(const Xapian::WritableDatabase &db); /// Flags to OR together and pass to TermGenerator::set_flags(). enum flags { /// Index data required for spelling correction. FLAG_SPELLING = 128, // Value matches QueryParser flag. /** Enable generation of n-grams from CJK text. * * With this enabled, spans of CJK characters are split into unigrams * and bigrams, with the unigrams carrying positional information. * Non-CJK characters are split into words as normal. * * The corresponding option needs to be passed to QueryParser. * * Flag added in Xapian 1.3.4 and 1.2.22, but this mode can be * enabled in 1.2.8 and later by setting environment variable * XAPIAN_CJK_NGRAM. */ FLAG_CJK_NGRAM = 2048 // Value matches QueryParser flag. }; /// Stemming strategies, for use with set_stemming_strategy(). typedef enum { STEM_NONE, STEM_SOME, STEM_ALL, STEM_ALL_Z } stem_strategy; /** Set flags. * * The new value of flags is: (flags & mask) ^ toggle * * To just set the flags, pass the new flags in toggle and the * default value for mask. * * @param toggle Flags to XOR. * @param mask Flags to AND with first. * * @return The old flags setting. */ flags set_flags(flags toggle, flags mask = flags(0)); /** Set the stemming strategy. * * This method controls how the stemming algorithm is applied. It was * new in Xapian 1.3.1. * * @param strategy The strategy to use - possible values are: * - STEM_NONE: Don't perform any stemming - only unstemmed terms * are generated. * - STEM_SOME: Generate both stemmed (with a "Z" prefix) and unstemmed * terms. This is the default strategy. * - STEM_ALL: Generate only stemmed terms (but without a "Z" prefix). * - STEM_ALL_Z: Generate only stemmed terms (with a "Z" prefix). */ void set_stemming_strategy(stem_strategy strategy); /** Set the maximum length word to index. * * The limit is on the length of a word prior to stemming and prior to * adding any term prefix. * * The backends mostly impose a limit on the length of terms (often of * about 240 bytes), but it's generally useful to have a lower limit to * help prevent the index being bloated by useless junk terms from trying * to indexing things like binary data, uuencoded data, ASCII art, etc. * * This method was new in Xapian 1.3.1. * * @param max_word_length The maximum length word to index, in bytes in * UTF-8 representation. Default is 64. */ void set_max_word_length(unsigned max_word_length); /** Index some text. * * @param itor Utf8Iterator pointing to the text to index. * @param wdf_inc The wdf increment (default 1). * @param prefix The term prefix to use (default is no prefix). */ void index_text(const Xapian::Utf8Iterator & itor, Xapian::termcount wdf_inc = 1, const std::string & prefix = std::string()); /** Index some text in a std::string. * * @param text The text to index. * @param wdf_inc The wdf increment (default 1). * @param prefix The term prefix to use (default is no prefix). */ void index_text(const std::string & text, Xapian::termcount wdf_inc = 1, const std::string & prefix = std::string()) { return index_text(Utf8Iterator(text), wdf_inc, prefix); } /** Index some text without positional information. * * Just like index_text, but no positional information is generated. This * means that the database will be significantly smaller, but that phrase * searching and NEAR won't be supported. * * @param itor Utf8Iterator pointing to the text to index. * @param wdf_inc The wdf increment (default 1). * @param prefix The term prefix to use (default is no prefix). */ void index_text_without_positions(const Xapian::Utf8Iterator & itor, Xapian::termcount wdf_inc = 1, const std::string & prefix = std::string()); /** Index some text in a std::string without positional information. * * Just like index_text, but no positional information is generated. This * means that the database will be significantly smaller, but that phrase * searching and NEAR won't be supported. * * @param text The text to index. * @param wdf_inc The wdf increment (default 1). * @param prefix The term prefix to use (default is no prefix). */ void index_text_without_positions(const std::string & text, Xapian::termcount wdf_inc = 1, const std::string & prefix = std::string()) { return index_text_without_positions(Utf8Iterator(text), wdf_inc, prefix); } /** Increase the term position used by index_text. * * This can be used between indexing text from different fields or other * places to prevent phrase searches from spanning between them (e.g. * between the title and body text, or between two chapters in a book). * * @param delta Amount to increase the term position by (default: 100). */ void increase_termpos(Xapian::termcount delta = 100); /// Get the current term position. Xapian::termcount get_termpos() const; /** Set the current term position. * * @param termpos The new term position to set. */ void set_termpos(Xapian::termcount termpos); /// Return a string describing this object. std::string get_description() const; }; } #endif // XAPIAN_INCLUDED_TERMGENERATOR_H