sphinxclient.h

Go to the documentation of this file.
00001 /*
00002  *
00003  * C++ sphinx search client library
00004  * Copyright (C) 2007  Seznam.cz, a.s.
00005  *
00006  * This library is free software; you can redistribute it and/or
00007  * modify it under the terms of the GNU Lesser General Public
00008  * License as published by the Free Software Foundation; either
00009  * version 2.1 of the License, or (at your option) any later version.
00010  *
00011  * This library is distributed in the hope that it will be useful,
00012  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00013  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00014  * Lesser General Public License for more details.
00015  *
00016  * You should have received a copy of the GNU Lesser General Public
00017  * License along with this library; if not, write to the Free Software
00018  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
00019  *
00020  * Seznam.cz, a.s.
00021  * Radlicka 2, Praha 5, 15000, Czech Republic
00022  * http://www.seznam.cz, mailto:sphinxclient@firma.seznam.cz
00023  *
00024  *
00025  * $Id: sphinxclient.h 35 2013-03-07 13:21:17Z honkir $
00026  *
00027  * DESCRIPTION
00028  * SphinxClient header file - communication library for the sphinx
00029  * search server
00030  *
00031  * AUTHOR
00032  * Jan Kirschner <jan.kirschner@firma.seznam.cz>
00033  *
00034  * HISTORY
00035  * 2006-12-11 (jan.kirschner)
00036  *            First draft.
00037  */
00038 
00040 
00041 #ifndef __SPHINXAPI_H__
00042 #define __SPHINXAPI_H__
00043 
00044 #include <sphinxclient/sphinxclientquery.h>
00045 #include <sphinxclient/value.h>
00046 #include <sphinxclient/globals_public.h>
00047 
00048 #include <sstream>
00049 #include <string>
00050 #include <vector>
00051 #include <list>
00052 #include <map>
00053 #include <stdint.h>
00054 
00055 #include "error.h"
00056 
00057 namespace Sphinx
00058 {
00059 
00060 //------------------------------------------------------------------------------
00061 // Private data structures forward declarations
00062 //------------------------------------------------------------------------------
00063 
// Names that later declarations in this header refer to before (or without)
// a full definition being visible at this point.
class Query_t;   // query container held by MultiQuery_t and SourceQuery_t
class Client_t;  // search client; its definition appears further down
class Filter_t;  // filter record returned by SearchConfig_t::getFilter(int)
00067 
00068 //------------------------------------------------------------------------------
/// Default for ConnectionConfig_t's connectRetriesCount constructor argument.
#define DEFAULT_CONNECT_RETRIES        1
/// Default for ConnectionConfig_t's connectRetryWait constructor argument
/// (milliseconds, going by the _MS suffix).
#define CONNECT_RETRY_WAIT_DEFAULT_MS  300
00071 // --------------------- configuration -----------------------------------------
00072 
00073 
/**
 * @brief Query matching mode, selected with SearchConfig_t::setMatchMode().
 *
 * The numeric values are spelled out explicitly and must not be renumbered.
 * NOTE(review): the names follow the SPH_MATCH_* constants of the Sphinx
 * search API; see the Sphinx documentation for the exact semantics of each
 * mode.
 */
enum MatchMode_t
{
    SPH_MATCH_ALL       = 0,  ///< presumably: match all query words
    SPH_MATCH_ANY       = 1,  ///< presumably: match any of the query words
    SPH_MATCH_PHRASE    = 2,  ///< presumably: match the query as a phrase
    SPH_MATCH_BOOLEAN   = 3,  ///< presumably: query is a boolean expression
    SPH_MATCH_EXTENDED  = 4,  ///< presumably: extended query syntax
    SPH_MATCH_FULLSCAN  = 5,  ///< presumably: full scan
    SPH_MATCH_EXTENDED2 = 6,  ///< second "extended" mode; NOTE(review): its
                              ///< availability appears to depend on the
                              ///< sphinx version - confirm
    SPH_MATCH_SZN       = 7   ///< presumably a Seznam-specific mode - confirm
};
00095 
/**
 * @brief Result sorting mode, selected with SearchConfig_t::setSorting().
 *
 * The numeric values are spelled out explicitly and must not be renumbered.
 * The original listing carried commented-out aliases SPH_SORT_ATTR_DESC (= 1)
 * and SPH_SORT_ATTR_ASC (= 2) next to the DATE_* enumerators; they remain
 * disabled here.
 */
enum SortMode_t
{
    SPH_SORT_RELEVANCE     = 0,
    SPH_SORT_DATE_DESC     = 1,   // disabled alias: SPH_SORT_ATTR_DESC = 1
    SPH_SORT_DATE_ASC      = 2,   // disabled alias: SPH_SORT_ATTR_ASC = 2
    SPH_SORT_TIME_SEGMENTS = 3,
    SPH_SORT_EXTENDED      = 4,
    SPH_SORT_EXPR          = 5,
    SPH_SORT_SZN           = 100  // presumably Seznam-specific - confirm
};
00111 
/**
 * @brief Ranking mode, selected with SearchConfig_t::setRanking().
 *
 * The numeric values are spelled out explicitly and must not be renumbered.
 * The per-value remarks below are reconstructed from comment fragments that
 * survived in the listing; treat them as hints and confirm against the
 * Sphinx documentation.
 */
enum RankingMode_t
{
    /// proximity-based ranking; BM25 appears to be the minor factor
    SPH_RANK_PROXIMITY_BM25 = 0,
    /// BM25 only; the surviving fragment mentions "worse quality"
    SPH_RANK_BM25           = 1,
    SPH_RANK_NONE           = 2,
    /// sum of per-field keyword occurrence counts
    SPH_RANK_WORDCOUNT      = 3,
    SPH_RANK_PROXIMITY      = 4,
    SPH_RANK_MATCHANY       = 5,
    /// per-field mask; a field's bit is set to 1 when a keyword is located
    /// there
    SPH_RANK_FIELDMASK      = 6,
    SPH_RANK_SPH04          = 7,
    SPH_RANK_EXPR           = 8,
    /// presumably the number of ranking modes rather than a mode - confirm
    SPH_RANK_TOTAL          = 9
};
00137 
/**
 * @brief Attribute type identifiers.
 *
 * Used, among others, by SearchConfig_t::addAttributeOverride(). The
 * multi-value types are formed by OR-ing SPH_ATTR_MULTI_FLAG with a small
 * integer, so SPH_ATTR_MULTI is 0x40000001 and SPH_ATTR_MULTI64 is
 * 0x40000002.
 */
enum AttributeType_t
{
    SPH_ATTR_INTEGER    = 1,
    SPH_ATTR_TIMESTAMP  = 2,
    SPH_ATTR_ORDINAL    = 3,
    SPH_ATTR_BOOL       = 4,
    SPH_ATTR_FLOAT      = 5,
    SPH_ATTR_BIGINT     = 6,
    SPH_ATTR_STRING     = 7,
    SPH_ATTR_MULTI_FLAG = 0x40000000,               // marker bit for multi-value types
    SPH_ATTR_MULTI      = SPH_ATTR_MULTI_FLAG | 1,
    SPH_ATTR_MULTI64    = SPH_ATTR_MULTI_FLAG | 2
};
00154 
/**
 * @brief Grouping function, selected with SearchConfig_t::setGrouping().
 *
 * The numeric values are spelled out explicitly and must not be renumbered.
 */
enum GroupFunction_t
{
    SPH_GROUPBY_DAY      = 0,
    SPH_GROUPBY_WEEK     = 1,
    SPH_GROUPBY_MONTH    = 2,
    SPH_GROUPBY_YEAR     = 3,
    SPH_GROUPBY_ATTR     = 4,
    SPH_GROUPBY_ATTRPAIR = 5
};
00166 
00167 
00176 class ConnectionConfig_t
00177 {
00179     struct PrivateData_t;
00181     PrivateData_t *d;
00182 public:
00183     /* @brief Contructor
00184      * @param host host where the searchd runs
00185      * @param port port what the searchd listens on
00186      * @param keepAlive ignored, currently not supported
00187      * @param connectTimeout max wait time for connection setup
00188      * @param readTimeout max wait time on read from socket
00189      *        (resets when something has been read)
00190      * @param writeTimeout max wait time on write from socket
00191      *        (resets when something has been written)
00192      * @param connectRetriesCount nr of connect attempts
00193      * @param connectRetryWait delay after failed connect attempt
00194      */
00195     ConnectionConfig_t(const std::string &host = "localhost",
00196                        unsigned short port = 3312,
00197                        bool keepAlive = true,
00198                        int32_t connectTimeout = 1000,
00199                        int32_t readTimeout = 3000,
00200                        int32_t writeTimeout = 3000,
00201                        int32_t connectRetriesCount = DEFAULT_CONNECT_RETRIES,
00202                        int32_t connectRetryWait = CONNECT_RETRY_WAIT_DEFAULT_MS);
00203     /* @brief copy constructor
00204      */
00205     ConnectionConfig_t(const ConnectionConfig_t &from);
00206     /* @brief assignment operator
00207      */
00208     ConnectionConfig_t &operator=(const ConnectionConfig_t &from);
00209 
00210     ~ConnectionConfig_t();
00211     /* @brief getters and setters
00212      */
00213     const std::string &getHost() const;
00214     unsigned short getPort() const;
00215     bool getKeepAlive() const;
00216     int32_t getConnectTimeout() const;
00217     int32_t getReadTimeout() const;
00218     int32_t getWriteTimeout() const;
00219     int32_t getConnectRetriesCount() const;
00220     int32_t getConnectRetryWait() const;
00221 };
00222 
00223 
/**
 * @brief Geographic anchor point: a coordinate pair plus the names of the
 *        attributes that hold latitude and longitude.
 *
 * Passed to SearchConfig_t::setGeoAnchorPoints(). The member names keep the
 * original "lattitude" spelling because they are part of the public
 * interface.
 * NOTE(review): the unit of the coordinates is not stated in this header
 * (the Sphinx API conventionally uses radians) - confirm.
 */
struct GeoAnchorPoint_t {
    /**
     * @brief Creates an anchor with empty attribute names and both
     *        coordinates set to 0.
     *
     * The floats are initialised explicitly; a default-constructed anchor
     * would otherwise carry indeterminate coordinate values.
     */
    GeoAnchorPoint_t()
        : lattitudeAttributeName(), longitudeAttributeName(),
          lattitude(0.0f), longitude(0.0f) {}

    /**
     * @brief Creates a fully specified anchor.
     * @param laAtt name of the latitude attribute
     * @param loAtt name of the longitude attribute
     * @param lattitude latitude of the anchor
     * @param longitude longitude of the anchor
     */
    GeoAnchorPoint_t(const std::string &laAtt, const std::string &loAtt,
                     float lattitude, float longitude)
        : lattitudeAttributeName(laAtt), longitudeAttributeName(loAtt),
          lattitude(lattitude), longitude(longitude) {}

    /// names of the attributes holding latitude and longitude
    std::string lattitudeAttributeName, longitudeAttributeName;
    /// coordinates of the anchor point
    float lattitude, longitude;
};
00239 
00247 struct SearchConfig_t
00248 {
00251     SearchConfig_t(const SearchConfig_t &from);
00252 
00255     SearchConfig_t &operator= (const SearchConfig_t &from);
00256 
00260     SearchConfig_t(SearchCommandVersion_t cmdVer = VER_COMMAND_SEARCH_2_0_5);
00261 
00265     ~SearchConfig_t();
00266 
00271     void addRangeFilter(const std::string &attrName, uint64_t minValue,
00272                   uint64_t maxValue, bool excludeFlag=false);
00277     void addEnumFilter(const std::string &attrName, const Int64Array_t &values,
00278                   bool excludeFlag=false);
00283     void addEnumFilter(const std::string &attrName, const IntArray_t &values,
00284                   bool excludeFlag=false);
00289     void addFloatRangeFilter(const std::string &attrName, float minValue,
00290                    float maxValue, bool excludeFlag=false);
00291 
00298     void addAttributeOverride(const std::string &attrName,
00299                               AttributeType_t attrType,
00300                               uint64_t docId, const Value_t &value);
00306     void addAttributeOverride(const std::string &attrName,
00307                               AttributeType_t attrType,
00308                               const std::map<uint64_t, Value_t> &values);
00309 
00322     bool getFilter(int index, std::string &attrname,
00323             bool &exclude, float &minValue, float &maxValue) const;
00324 
00337     bool getFilter(int index, std::string &attrname,
00338             bool &exclude, uint64_t &minValue, uint64_t &maxValue) const;
00339 
00351     bool getFilter(int index, std::string &attrname,
00352             bool &exclude, Int64Array_t &values) const;
00353 
00355     unsigned getFilterCount() const;
00356 
00358     const Filter_t *getFilter(int index) const;
00359 
00361     SearchCommandVersion_t getCommandVersion() const;
00362 
00368     void setPaging(uint32_t msgOffset, uint32_t msgLimit);
00369 
00371     void setMatchMode(MatchMode_t matchMode);
00372 
00378     void setSorting(SortMode_t sortMode, const std::string &sortBy = "");
00379 
00385     void setRanking(RankingMode_t rankingMode, const std::string &rankExpr = "");
00386 
00393     void setGrouping(
00394             GroupFunction_t groupFunction,
00395             const std::string &groupBy = "",
00396             const std::string &groupSort = "");
00397 
00399     void setGroupDistinctAttribute(const std::string &attributeName);
00400 
00402     void setMaxMatches(int maxMatches);
00404     void setMaxQueryTime(uint32_t maxQueryTime);
00405 
00407     void setSearchedIndexes(const std::string &indexNames);
00413     void setIndexWeight(const std::string &indexName, uint32_t weight);
00419     void setFieldWeight(const std::string &fieldName, uint32_t weight);
00420 
00422     void setSearchCutoff(uint32_t searchCutOff);
00428     void setRetries(uint32_t distRetryCount, uint32_t distRetryDelay);
00429 
00431     void setGeoAnchorPoints(const std::vector<GeoAnchorPoint_t> &anchorPoints);
00432 
00434     void setQueryComment(const std::string &queryComment);
00435 
00440     void setSelectClause(const std::string &selectClause);
00441 
00442 
00444     uint32_t getPagingOffset() const;
00446     uint32_t getPagingLimit() const;
00447 
00449     MatchMode_t getMatchMode() const;
00450 
00452     SortMode_t getSortingMode() const;
00454     const std::string &getSortingExpr() const;
00455 
00457     RankingMode_t getRankingMode() const;
00459     const std::string &getRankingExpr() const;
00460 
00462     GroupFunction_t getGroupingFunction() const;
00464     const std::string &getGroupByExpr() const;
00466     const std::string &getGroupSortExpr() const;
00468     const std::string &getGroupDistinctAttribute() const;
00469 
00471     int getMaxMatches() const;
00473     uint32_t getMaxQueryTime() const;
00474 
00476     const std::string &getSearchedIndexes() const;
00478     const std::map<std::string, uint32_t> &getIndexWeights() const;
00480     const std::map<std::string, uint32_t> &getFieldWeights() const;
00481 
00483     uint32_t getSearchCutoff() const;
00485     uint32_t getDistRetryCount() const;
00487     uint32_t getDistRetryDelay() const;
00488 
00490     const std::vector<GeoAnchorPoint_t> &getGeoAnchorPoints() const;
00491 
00493     const std::string &getQueryComment() const;
00494 
00496     const std::string &getSelectClause() const;
00497 
00499     typedef std::map<
00500         std::string,
00501         std::pair<AttributeType_t, std::map<uint64_t, Value_t> > >
00502             AttributeOverrides_t;
00503 
00505     const AttributeOverrides_t &getAttributeOverrides() const;
00506 
00507 private:
00508     struct Dptr_t;
00509     Dptr_t *dptr;
00510 };
00511 
00512 // ------------------------ Client_t -----------------------
00513 
00514 // ------------ response data structures --------------
00515 
00521 struct ResponseEntry_t
00522 {
00523     uint64_t documentId;   
00524     uint32_t groupId;      
00525     uint32_t timestamp;    
00526     uint32_t weight;       
00527 
00528     std::map<std::string, Value_t> attribute;
00529 
00530     ResponseEntry_t()
00531         : documentId(0), groupId(0), timestamp(0), weight(0) {}
00532 };//struct
00533 
/**
 * @brief Per-word statistics, used by Response_t::word and
 *        KeywordResult_t::statistics.
 *
 * Plain aggregate with no constructor: the members of a default-initialised
 * instance are not set to any particular value.
 */
struct WordStatistics_t
{
    /// presumably the number of documents containing the word - confirm
    uint32_t docsHit;
    /// presumably the total number of occurrences of the word - confirm
    uint32_t totalHits;
};//struct
00544 
/// Ordered list of (attribute name, numeric value) pairs; Response_t::attribute
/// has this type. NOTE(review): the number is presumably an AttributeType_t
/// value - confirm.
typedef std::vector<
            std::pair<std::string, uint32_t> >
        AttributeTypes_t;
00546 
00552 struct Response_t
00553 {
00555     std::vector<std::string> field;
00556 
00558     AttributeTypes_t attribute;
00559 
00561     std::vector<ResponseEntry_t> entry;
00562 
00564     std::map<std::string, WordStatistics_t> word;
00565 
00566     // global statistics
00567     uint32_t entriesGot;    
00568     uint32_t entriesFound;  
00569     uint32_t timeConsumed;  
00570     uint32_t use64bitId;    
00571 
00572     SearchCommandVersion_t commandVersion; 
00573 
00574     void clear();
00575 };//struct
00576 
00578 struct KeywordResult_t {
00580     std::string tokenized;
00582     std::string normalized;
00584     WordStatistics_t statistics;
00585 };
00586 
00592 class MultiQuery_t
00593 {
00594 protected:
00595     SearchCommandVersion_t commandVersion;
00596     Query_t queries;
00597     int queryCount;
00598 
00599 public:
00604     MultiQuery_t(SearchCommandVersion_t cmdVersion = VER_COMMAND_SEARCH_0_9_9);
00605 
00607     void initQuery(SearchCommandVersion_t commandVersion);
00608 
00620     void addQuery(const std::string& query, const SearchConfig_t &queryAttr);
00621 
00622     int getQueryCount() const; 
00623     const Query_t &getQueries() const; 
00624 
00625     SearchCommandVersion_t getCommandVersion() const;
00626 };//class
00627 
00628 
00629 // ---------------------------- MultiqueryOpt_t --------------------------------
00630 
00631 /* @brief Sphinx::Query_t with possibility identify whether the query is 
00632  *        composable into multiquery with another query
00633  */
00634 class SourceQuery_t {
00635 public:
00636     /* Constructor
00637      * 
00638      * @param query query string
00639      * @param queryAttr sorting, grouping, etc...
00640      * @param seqNo incoming sequence number of query
00641      */
00642     SourceQuery_t(const std::string &query, const SearchConfig_t &queryAttr,
00643                   int seqNo);
00644     /* @brief compute hash from filters, etc. that must be same within one
00645      *        efficient multiquery
00646      * @return hash
00647      */ 
00648     const std::string &getHash() const {return hash;}
00649     /* @brief get sequence nr. of input query
00650      * @return sequence nr (starting grom 0.)
00651      */
00652     int getInputSeqNo() const {return inputSeqNo;}
00653     /* @brief get encapsulated Sphinx::Query_t
00654      * @return the query
00655      */
00656     const Query_t &getQuery() const {return serializedQuery;}
00657 
00658 private:
00660     Query_t serializedQuery;
00663     std::string hash;
00665     int inputSeqNo;
00666 };
00667 
00668 
00694 class MultiQueryOpt_t
00695 {
00696 public:
00701     MultiQueryOpt_t(SearchCommandVersion_t cmdVersion = VER_COMMAND_SEARCH_0_9_9);
00702 
00703     /* @brief groups input queries into groups efficient for sphinx multiquery
00704      *        processing.
00705      *
00706      * Optimisation is disabled for command version < 0.9.8.
00707      */
00708     void optimise();
00709 
00721     void addQuery(const std::string& query, const SearchConfig_t &queryAttr);
00722 
00723     friend class Sphinx::Client_t;
00724 protected:
00725     SearchCommandVersion_t commandVersion;
00727     std::list<SourceQuery_t> sourceQueries;
00730     std::vector<const SourceQuery_t *> sortedQueries;
00732     std::vector<std::pair<int, int> > responseIndex;
00735     std::vector<int> groupQueries;
00736 
00738     void initQuery(SearchCommandVersion_t commandVersion);
00739 
00740     /* @brief get query group concatenated to one multiquery sphinx request
00741      * @param groupIndex index of query group
00742      * @return multiquery request
00743      */
00744     Sphinx::Query_t getGroupQuery(size_t groupIndex) const;
00745 
00746     /* @brief get count of query groups, that are efficient to process by
00747      *        sphinx multiquery mechanism
00748      * @return query group count
00749      */
00750     size_t getGroupQueryCount() const;
00751 
00753     int getQueryCount() const; 
00754     
00759     size_t getQueryCountAtGroup(size_t groupIndex) const;
00760 
00766     size_t getResponseIndex(size_t sortedIndex) const;
00767 
00769     SearchCommandVersion_t getCommandVersion() const;
00770 };//class
00771 
00772 // ------------------------ Attribute updates ---------------------------
00773 
00777 struct AttributeUpdates_t
00778 {
00779     UpdateCommandVersion_t commandVersion;
00780     std::vector<std::string> attributes;
00781     std::map<uint64_t, std::vector<Value_t> > values;
00782 
00784     AttributeUpdates_t();
00785 
00787     void setAttributeList(const std::vector<std::string> &attr
00788                                              = std::vector<std::string>());
00790     void addAttribute(const std::string &);
00791     
00797     void addDocument(uint64_t id, const std::vector<Value_t> &vals);
00798     
00806     void addDocument(uint64_t id, ValueType_t t, ...);
00807    
00809     void setCommandVersion(UpdateCommandVersion_t v);
00810 };//struct
00811 
00812 
00813 
00814 // ------------ main class --------------
00815 
00822 class Client_t
00823 {
00824 public:
00825     Client_t(const ConnectionConfig_t &connectionSettings);
00826 
00838     void query(const std::string& query,
00839                const SearchConfig_t &queryAttr,
00840                Response_t &response);
00841 
00853     void query(const MultiQuery_t &query, std::vector<Response_t> &response);
00854 
00870     void query(const MultiQueryOpt_t &query, std::vector<Response_t> &response);
00871     
00883     void updateAttributes(const std::string &index, const AttributeUpdates_t &at);
00884 
00895     std::vector<KeywordResult_t> getKeywords(
00896         const std::string &index,
00897         const std::string &query,
00898         bool getWordStatistics = false);
00899 
00900 protected:
00901     // -------- connection settings -----------------
00902     ConnectionConfig_t connection;
00903 };//class
00904 
00905 
/**
 * @brief Returns an escaped copy of a query string.
 * @param query query string to escape
 * @return the escaped string
 *
 * NOTE(review): which characters are escaped is defined by the
 * implementation, which is not visible here.
 */
std::string escapeQueryString(const std::string &query);
00912 
00913 }//namespace
00914 
// Symbol with C linkage and no arguments or result.
// NOTE(review): presumably exists so that build-system checks (for example
// autoconf's AC_CHECK_LIB) can detect the library - confirm.
extern "C" void sphinxClientDummy();
00918 
00919 #endif
00920