Source code

001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.text.nlp.namedentity;
031
032import java.io.IOException;
033import java.util.HashMap;
034import java.util.List;
035
036import org.apache.lucene.analysis.Analyzer;
037import org.apache.lucene.document.Document;
038import org.apache.lucene.index.CorruptIndexException;
039import org.apache.lucene.index.DirectoryReader;
040import org.apache.lucene.index.Term;
041import org.apache.lucene.queries.TermsFilter;
042import org.apache.lucene.queryparser.classic.ParseException;
043import org.apache.lucene.queryparser.classic.QueryParser;
044import org.apache.lucene.search.IndexSearcher;
045import org.apache.lucene.search.Query;
046import org.apache.lucene.search.ScoreDoc;
047import org.apache.lucene.search.TopScoreDocCollector;
048import org.apache.lucene.store.Directory;
049import org.apache.lucene.util.Version;
050
051/**
052 * Given a lucene {@link Directory} index and an {@link Analyzer} allow for
053 * searches of particular fields.
054 * 
055 * @author Laurence Willmore (lgw1e10@ecs.soton.ac.uk)
056 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
057 * 
058 */
059public class QuickSearcher {
060
061        IndexSearcher searcher;
062        Analyzer analyser;
063
064        /**
065         * the index to search and the analyser to use to process queries
066         * 
067         * @param index
068         * @param analyser
069         */
070        public QuickSearcher(Directory index, Analyzer analyser) {
071                try {
072                        final DirectoryReader reader = DirectoryReader.open(index);
073                        searcher = new IndexSearcher(reader);
074                } catch (final CorruptIndexException e) {
075                        e.printStackTrace();
076                        System.exit(1);
077                } catch (final IOException e) {
078                        e.printStackTrace();
079                        System.exit(1);
080                }
081                this.analyser = analyser;
082
083        }
084
085        /**
086         * Given a search field to search,the name of the field to return results in
087         * and a query string, return search results up to the limit.
088         * 
089         * @param searchfieldName
090         * @param returnFieldName
091         * @param queryStr
092         * @param limit
093         * @return search results (with confidences)
094         * @throws ParseException
095         * @throws IOException
096         */
097        public HashMap<String[], Float> search(String searchfieldName,
098                        String[] returnFieldName, String queryStr, int limit)
099                        throws ParseException, IOException {
100                if (queryStr == null || queryStr.length() == 0)
101                        return new HashMap<String[], Float>();
102                final String clean = QueryParser.escape(queryStr);
103                final Query q = new QueryParser(Version.LUCENE_40, searchfieldName,
104                                analyser).parse(clean);
105                final TopScoreDocCollector collector = TopScoreDocCollector.create(
106                                limit, true);
107
108                searcher.search(q, collector);
109                final ScoreDoc[] hits = collector.topDocs().scoreDocs;
110                final HashMap<String[], Float> results = new HashMap<String[], Float>();
111                for (int i = 0; i < hits.length; ++i) {
112                        final int docId = hits[i].doc;
113                        final Document d = searcher.doc(docId);
114                        String[] rvalues = new String[returnFieldName.length];
115                        for(int j=0;j<rvalues.length;j++){
116                                rvalues[j]=d.get(returnFieldName[j]);
117                        }
118                        results.put(rvalues, hits[i].score);
119                }
120                return results;
121        }
122
123        /**
124         * Given a list of values for the filterField, this method will return the
125         * scores of a search for the documents which satisfy one of those filter
126         * values.
127         * 
128         * @see #search(String, String[], String, int)
129         * @param searchfieldName = Name of the field to search
130         * @param returnFieldName = Name of the Field to return
131         * @param queryStr = String that should be used to search
132         * @param filterFieldName = Name of field to filter on
133         * @param filterQueries = Values of the filterField. Only documents with one of these values will be returned.
134         * @return same as the other search
135         */
136        public HashMap<String[], Float> searchFiltered(String searchfieldName,
137                        String[] returnFieldName, String queryStr, String filterFieldName,
138                        List<String> filterQueries) {
139                if (queryStr == null || queryStr.length() == 0)
140                        return new HashMap<String[], Float>();
141                HashMap<String[], Float> results = new HashMap<String[], Float>();
142                //Make the query a filter
143                TermsFilter qf = new TermsFilter();
144                for (String filterValue : filterQueries) {
145                        qf.addTerm(new Term(filterFieldName, filterValue));
146                }
147                final String clean = QueryParser.escape(queryStr);
148                Query q = null;
149                try {
150                        q = new QueryParser(Version.LUCENE_40, searchfieldName, analyser)
151                                        .parse(clean);
152                } catch (final ParseException e) {      
153                        e.printStackTrace();
154                }
155                try {                   
156                        final ScoreDoc[] hits = searcher.search(q, qf, filterQueries.size()).scoreDocs;
157                        for (int i = 0; i < hits.length; ++i) {
158                                final int docId = hits[i].doc;
159                                final Document d = searcher.doc(docId);
160                                String[] rvalues = new String[returnFieldName.length];
161                                for(int j=0;j<rvalues.length;j++){
162                                        rvalues[j]=d.get(returnFieldName[j]);
163                                }
164                                results.put(rvalues, hits[i].score);
165                        }
166                } catch (final IOException e) {                 
167                        e.printStackTrace();
168                }
169                return results;
170                //TODO: Scores of 0 are not returned by the lucene searcher. Need to fill in the 0's and also have fallback disambiguation
171        }
172
173}