Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » analysis » [javadoc | source]
    1   package org.apache.lucene.analysis;
    2   
    3   /**
    4    * Licensed to the Apache Software Foundation (ASF) under one or more
    5    * contributor license agreements.  See the NOTICE file distributed with
    6    * this work for additional information regarding copyright ownership.
    7    * The ASF licenses this file to You under the Apache License, Version 2.0
    8    * (the "License"); you may not use this file except in compliance with
    9    * the License.  You may obtain a copy of the License at
   10    *
   11    *     http://www.apache.org/licenses/LICENSE-2.0
   12    *
   13    * Unless required by applicable law or agreed to in writing, software
   14    * distributed under the License is distributed on an "AS IS" BASIS,
   15    * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   16    * See the License for the specific language governing permissions and
   17    * limitations under the License.
   18    */
   19   
   20   import java.io.File;
   21   import java.io.IOException;
   22   import java.io.Reader;
   23   import java.util.Arrays;
   24   import java.util.Set;
   25   import java.util.List;
   26   
   27   import org.apache.lucene.util.Version;
   28   
   29   /** Filters {@link LetterTokenizer} with {@link LowerCaseFilter} and {@link StopFilter}.
   30    *
   31    * <a name="version"/>
   32    * <p>You must specify the required {@link Version}
   33    * compatibility when creating StopAnalyzer:
   34    * <ul>
   35    *   <li> As of 2.9, position increments are preserved
   36    * </ul>
   37   */
   38   
   39   public final class StopAnalyzer extends Analyzer {
   40     private final Set<?> stopWords;
   41     private final boolean enablePositionIncrements;
   42     
   43     /** An unmodifiable set containing some common English words that are not usually useful
   44     for searching.*/
   45     public static final Set<?> ENGLISH_STOP_WORDS_SET;
   46     
   47     static {
   48       final List<String> stopWords = Arrays.asList(
   49         "a", "an", "and", "are", "as", "at", "be", "but", "by",
   50         "for", "if", "in", "into", "is", "it",
   51         "no", "not", "of", "on", "or", "such",
   52         "that", "the", "their", "then", "there", "these",
   53         "they", "this", "to", "was", "will", "with"
   54       );
   55       final CharArraySet stopSet = new CharArraySet(stopWords.size(), false);
   56       stopSet.addAll(stopWords);  
   57       ENGLISH_STOP_WORDS_SET = CharArraySet.unmodifiableSet(stopSet); 
   58     }
   59     
   60     /** Builds an analyzer which removes words in
   61      *  {@link #ENGLISH_STOP_WORDS_SET}.
   62      * @param matchVersion See <a href="#version">above</a>
   63      */
   64     public StopAnalyzer(Version matchVersion) {
   65       stopWords = ENGLISH_STOP_WORDS_SET;
   66       enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
   67     }
   68   
   69     /** Builds an analyzer with the stop words from the given set.
   70      * @param matchVersion See <a href="#version">above</a>
   71      * @param stopWords Set of stop words */
   72     public StopAnalyzer(Version matchVersion, Set<?> stopWords) {
   73       this.stopWords = stopWords;
   74       enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
   75     }
   76   
   77     /** Builds an analyzer with the stop words from the given file.
   78      * @see WordlistLoader#getWordSet(File)
   79      * @param matchVersion See <a href="#version">above</a>
   80      * @param stopwordsFile File to load stop words from */
   81     public StopAnalyzer(Version matchVersion, File stopwordsFile) throws IOException {
   82       stopWords = WordlistLoader.getWordSet(stopwordsFile);
   83       this.enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
   84     }
   85   
   86     /** Builds an analyzer with the stop words from the given reader.
   87      * @see WordlistLoader#getWordSet(Reader)
   88      * @param matchVersion See <a href="#version">above</a>
   89      * @param stopwords Reader to load stop words from */
   90     public StopAnalyzer(Version matchVersion, Reader stopwords) throws IOException {
   91       stopWords = WordlistLoader.getWordSet(stopwords);
   92       this.enablePositionIncrements = StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion);
   93     }
   94   
   95     /** Filters LowerCaseTokenizer with StopFilter. */
   96     @Override
   97     public TokenStream tokenStream(String fieldName, Reader reader) {
   98       return new StopFilter(enablePositionIncrements, new LowerCaseTokenizer(reader), stopWords);
   99     }
  100   
  101     /** Filters LowerCaseTokenizer with StopFilter. */
  102     private class SavedStreams {
  103       Tokenizer source;
  104       TokenStream result;
  105     };
  106     @Override
  107     public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
  108       SavedStreams streams = (SavedStreams) getPreviousTokenStream();
  109       if (streams == null) {
  110         streams = new SavedStreams();
  111         streams.source = new LowerCaseTokenizer(reader);
  112         streams.result = new StopFilter(enablePositionIncrements, streams.source, stopWords);
  113         setPreviousTokenStream(streams);
  114       } else
  115         streams.source.reset(reader);
  116       return streams.result;
  117     }
  118   }
  119   

Save This Page
Home » lucene-3.0.1-src » org.apache » lucene » analysis » [javadoc | source]