diff --git a/pom.xml b/pom.xml
index b64ce66..329a767 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
     <modelVersion>4.0.0</modelVersion>
     <groupId>com.yakaz.elasticsearch.plugins</groupId>
     <artifactId>elasticsearch-analysis-combo</artifactId>
-    <version>1.5.2-SNAPSHOT</version>
+    <version>2.1.1-SNAPSHOT</version>
     <packaging>jar</packaging>
     <inceptionYear>2011</inceptionYear>
@@ -38,8 +38,9 @@
-        <elasticsearch.version>1.0.0.RC1</elasticsearch.version>
-        <lucene.version>4.6.0</lucene.version>
+        <elasticsearch.version>2.1.1</elasticsearch.version>
+        <lucene.version>5.3.1</lucene.version>
+        <mvn.java.version>1.7</mvn.java.version>
@@ -75,6 +76,19 @@
             <scope>test</scope>
         </dependency>
+        <dependency>
+            <groupId>junit</groupId>
+            <artifactId>junit</artifactId>
+            <version>4.11</version>
+            <exclusions>
+                <exclusion>
+                    <artifactId>hamcrest-core</artifactId>
+                    <groupId>org.hamcrest</groupId>
+                </exclusion>
+            </exclusions>
+            <scope>test</scope>
+        </dependency>
         <dependency>
             <groupId>log4j</groupId>
             <artifactId>log4j</artifactId>
@@ -117,8 +131,8 @@
                 <artifactId>maven-compiler-plugin</artifactId>
                 <version>2.3.2</version>
                 <configuration>
-                    <source>1.6</source>
-                    <target>1.6</target>
+                    <source>${mvn.java.version}</source>
+                    <target>${mvn.java.version}</target>
diff --git a/src/main/assemblies/plugin.xml b/src/main/assemblies/plugin.xml
index 9e3ccb3..2e041df 100644
--- a/src/main/assemblies/plugin.xml
+++ b/src/main/assemblies/plugin.xml
@@ -5,6 +5,13 @@
         <format>zip</format>
     </formats>
     <includeBaseDirectory>false</includeBaseDirectory>
+    <files>
+        <file>
+            <source>src/main/resources/plugin-descriptor.properties</source>
+            <outputDirectory></outputDirectory>
+            <filtered>true</filtered>
+        </file>
+    </files>
     <dependencySets>
         <dependencySet>
             <outputDirectory>/</outputDirectory>
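Note on the new <files> entry: ES 2.x refuses to load a plugin whose zip lacks a plugin-descriptor.properties at its root, and filtering lets Maven substitute the ${...} placeholders at build time. A minimal sketch of what the filtered source file could contain; the keys are the standard ES 2.x descriptor keys, but the description and classname values are assumptions, not taken from this patch:

    description=Combo analyzer plugin (assumed description)
    version=${project.version}
    name=${project.artifactId}
    classname=org.elasticsearch.plugin.analysis.combo.AnalysisComboPlugin
    java.version=1.7
    elasticsearch.version=${elasticsearch.version}
    jvm=true
    site=false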
diff --git a/src/main/java/org/apache/lucene/analysis/ComboAnalyzer.java b/src/main/java/org/apache/lucene/analysis/ComboAnalyzer.java
index 86678c3..a94c729 100644
--- a/src/main/java/org/apache/lucene/analysis/ComboAnalyzer.java
+++ b/src/main/java/org/apache/lucene/analysis/ComboAnalyzer.java
@@ -18,18 +18,14 @@ package org.apache.lucene.analysis;
 
 import org.apache.lucene.analysis.miscellaneous.UniqueTokenFilter;
-import org.apache.lucene.util.CloseableThreadLocal;
 import org.apache.lucene.util.ReaderCloneFactory;
-import org.apache.lucene.util.Version;
-import org.elasticsearch.common.logging.ESLogger;
-import org.elasticsearch.common.logging.ESLoggerFactory;
 
 import java.io.IOException;
 import java.io.Reader;
-import java.util.Arrays;
+import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Map;
 import java.util.Set;
-import java.util.concurrent.atomic.AtomicReference;
 
 /**
  * An analyzer that combines multiple sub-analyzers into one.
@@ -50,8 +46,6 @@
  */
 public class ComboAnalyzer extends Analyzer {
 
-    protected static final ESLogger logger = ESLoggerFactory.getLogger(ComboAnalyzer.class.getSimpleName());
-
     /**
      * Default value for the enabled state of {@link TokenStream} caching.
      */
@@ -71,13 +65,8 @@ public class ComboAnalyzer extends Analyzer {
 
     private boolean deduplication = DEDUPLICATION_ENABLED_DEFAULT;
 
-    private CloseableThreadLocal<TokenStream[]> lastTokenStreams = new CloseableThreadLocal<TokenStream[]>();
-    private CloseableThreadLocal<TokenStream[]> tempTokenStreams = new CloseableThreadLocal<TokenStream[]>();
-    private CloseableThreadLocal<ReusableTokenStreamComponents> lastComboTokenStream = new CloseableThreadLocal<ReusableTokenStreamComponents>();
-
-    public ComboAnalyzer(Version version, Analyzer... subAnalyzers) {
-        super(new GlobalReuseStrategy());
-
+    public ComboAnalyzer(Analyzer... subAnalyzers) {
+        super();
         this.subAnalyzers = subAnalyzers;
 
         // Detect duplicates in analyzers
@@ -168,113 +157,109 @@ public ComboAnalyzer disableDeduplication() {
         return this;
     }
 
-    protected ReaderCloneFactory.ReaderCloner<Reader> cloneReader(Reader originalReader) {
-        ReaderCloneFactory.ReaderCloner<Reader> rtn;
+    private static Tokenizer DUMMY_TOKENIZER = new Tokenizer() {
+        @Override
+        public boolean incrementToken() throws IOException {
+            return false;
+        }
+    };
 
-        // Duplication of the original reader, to feed all sub-analyzers
-        if (subAnalyzers.length <= 1) {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName) {
+        return new CombiningTokenStreamComponents(fieldName);
+    }
 
-            // Can reuse the only reader we have, there will be no need of duplication
-            // Usage of the AtomicReference ensures that the same reader won't be duplicated.
-            ReaderCloneFactory.ReaderCloner<Reader> useOnceReaderCloner = new ReaderCloneFactory.ReaderCloner<Reader>() {
-                private AtomicReference<Reader> singleUsageReference = null;
-                public void init(Reader originalReader) throws IOException {
-                    singleUsageReference = new AtomicReference<Reader>(originalReader);
-                }
-                public Reader giveAClone() {
-                    return singleUsageReference.getAndSet(null);
-                }
-            };
-            try {
-                useOnceReaderCloner.init(originalReader);
-            } catch (Throwable fail) {
-                useOnceReaderCloner = null;
-            }
-            rtn = useOnceReaderCloner;
+    @Override public void close() {
+        super.close();
+    }
 
-        } else {
+    private class CombiningTokenStreamComponents extends TokenStreamComponents {
 
-            rtn = ReaderCloneFactory.getCloner(originalReader); // internally uses the default "should always work" implementation
+        private final Map<Analyzer, CachingTokenStream> duplicateAnalyzers = new HashMap<Analyzer, CachingTokenStream>();
+        private final String field;
+        private Reader reader;
 
+        public CombiningTokenStreamComponents(String field) {
+            super(DUMMY_TOKENIZER);
+            this.field = field;
         }
 
-        if (rtn == null) {
-            throw new IllegalArgumentException("Could not duplicate the original reader to feed multiple sub-readers");
+        @Override
+        public void setReader(Reader reader) throws IOException {
+            duplicateAnalyzers.clear();
+            this.reader = reader;
         }
-        return rtn;
-    }
 
-    @Override
-    protected TokenStreamComponents createComponents(String fieldName, Reader originalReader) {
-        // Duplication of the original reader, to feed all sub-analyzers
-        ReaderCloneFactory.ReaderCloner<Reader> readerCloner = cloneReader(originalReader);
-
-        // We remember last used TokenStreams because many times Analyzers can provide a reusable TokenStream
-        // Detecting that all sub-TokenStreams are reusable permits to reuse our ComboTokenStream as well.
-        if (tempTokenStreams.get() == null) tempTokenStreams.set(new TokenStream[subAnalyzers.length]); // each time non reusability has been detected
-        if (lastTokenStreams.get() == null) lastTokenStreams.set(new TokenStream[subAnalyzers.length]); // only at first run
-        TokenStream[] tempTokenStreams_local = tempTokenStreams.get();
-        TokenStream[] lastTokenStreams_local = lastTokenStreams.get();
-        ReusableTokenStreamComponents lastComboTokenStream_local = lastComboTokenStream.get();
-        if (lastComboTokenStream_local == null)
-            lastComboTokenStream_local = new ReusableTokenStreamComponents(fieldName, this);
+        @Override
+        public TokenStream getTokenStream() {
+            TokenStream ret = createTokenStreams();
+            return deduplication ? new UniqueTokenFilter(ret) : ret;
+        }
 
-        // Get sub-TokenStreams from sub-analyzers
-        for (int i = subAnalyzers.length-1 ; i >= 0 ; --i) {
+        private TokenStream createTokenStreams() {
+            if (subAnalyzers.length == 1) {
+                return createTokenStream(subAnalyzers[0], field, reader);
+            } else {
+                ReaderCloneFactory.ReaderCloner<Reader> cloner = ReaderCloneFactory.getCloner(reader);
+                TokenStream[] streams = new TokenStream[subAnalyzers.length];
+                for (int i = 0; i < subAnalyzers.length; i++) {
+                    streams[i] = createTokenStream(subAnalyzers[i], field, cloner.giveAClone());
+                }
+                return new ComboTokenStream(streams);
+            }
+        }
 
-            // Feed the troll
-            Reader reader = readerCloner.giveAClone();
-            tempTokenStreams_local[i] = null;
+        private TokenStream createTokenStream(Analyzer analyzer, String field, Reader reader) {
             try {
-                tempTokenStreams_local[i] = subAnalyzers[i].tokenStream(fieldName, reader);
-            } catch (IOException ignored) {
-                logger.debug("Ignoring {}th analyzer [{}]. Could not get a TokenStream.", ignored, i, subAnalyzers[i]);
-            }
-            // Use caching if asked or if required in case of duplicated analyzers
-            if (cacheTokenStreams || hasDuplicatedAnalyzers && duplicatedAnalyzers.contains(subAnalyzers[i])) {
-                CachingTokenStream cache = new CachingTokenStream(tempTokenStreams_local[i]);
-                try {
-                    tempTokenStreams_local[i].reset();
-                    cache.fillCache();
-                } catch (IOException ignored) {
-                    logger.debug("Got an error when caching TokenStream from the {}th analyzer [{}]", i, subAnalyzers[i]);
+                if (hasDuplicatedAnalyzers && duplicatedAnalyzers.contains(analyzer)) {
+                    return createCachedCopies(analyzer, field, reader);
+                } else if (cacheTokenStreams) {
+                    return loadAndClose(analyzer.tokenStream(field, reader));
                 }
-                try {
-                    // Close original stream, all tokens are buffered
-                    tempTokenStreams_local[i].close();
-                } catch (IOException ignored) {
-                    logger.debug("Got an error when closing TokenStream from the {}th analyzer [{}]", i, subAnalyzers[i]);
+                else {
+                    return analyzer.tokenStream(field, reader);
                 }
-                tempTokenStreams_local[i] = cache;
+            } catch (IOException e) {
+                throw new RuntimeException(e);
             }
-            // Detect non reusability
-            if (tempTokenStreams_local[i] != lastTokenStreams_local[i]) {
-                lastComboTokenStream_local.setTokenStream(null);
+        }
+
+        private TokenStream createCachedCopies(Analyzer analyzer, String field, Reader reader) throws IOException {
+            // First time we see this analyzer, means that we have to cache the content
+            if (!duplicateAnalyzers.containsKey(analyzer)) {
+                CachingTokenStream caching = loadAndClose(analyzer.tokenStream(field, reader));
+                duplicateAnalyzers.put(analyzer, caching);
+                return caching;
+            } else {
+                // Already seen, can just create a new copy of the cached
+                return loadAsCaching(duplicateAnalyzers.get(analyzer));
             }
         }
 
-        // If last ComboTokenStream is not available create a new one
-        // This happens in the first call and in case of non reusability
-        if (lastComboTokenStream_local.getTokenStream() == null) {
-            // Clear old invalid references (preferred over allocating a new array)
-            Arrays.fill(lastTokenStreams_local, null);
-            // Swap temporary and last (non reusable) TokenStream references
-            lastTokenStreams.set(tempTokenStreams_local);
-            tempTokenStreams.set(lastTokenStreams_local);
-            // New ComboTokenStream to use
-            lastComboTokenStream_local.setTokenStream(new ComboTokenStream(tempTokenStreams_local));
-            if (deduplication)
-                lastComboTokenStream_local.setTokenStream(new UniqueTokenFilter(lastComboTokenStream_local.getTokenStream(), true));
-            lastComboTokenStream.set(lastComboTokenStream_local);
+        private CachingTokenStream loadAndClose(TokenStream tokenStream) {
+            CachingTokenStream cache = loadAsCaching(tokenStream);
+            try {
+                tokenStream.close();
+            } catch (IOException e) {
+                throw new RuntimeException(e);
+            }
+            return cache;
         }
-        return lastComboTokenStream_local;
-    }
 
-    @Override public void close() {
-        super.close();
-        lastTokenStreams.close();
-        tempTokenStreams.close();
-        lastComboTokenStream.close();
+        private CachingTokenStream loadAsCaching(TokenStream tokenStream) {
+            try {
+                CachingTokenStream cachingTokenStream = new CachingTokenStream(tokenStream);
+                tokenStream.reset();
+                cachingTokenStream.fillCache();
+                return cachingTokenStream;
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+        }
     }
-
 }
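With the Lucene Version argument gone from the constructor, combining analyzers is now plain composition. A minimal usage sketch against the new API (the pairing of KeywordAnalyzer and WhitespaceAnalyzer is an arbitrary choice for illustration):

    import org.apache.lucene.analysis.ComboAnalyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.core.KeywordAnalyzer;
    import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class ComboAnalyzerDemo {
        public static void main(String[] args) throws Exception {
            // Tokens from all sub-analyzers come out interleaved in one stream
            ComboAnalyzer analyzer = new ComboAnalyzer(new KeywordAnalyzer(), new WhitespaceAnalyzer());
            TokenStream ts = analyzer.tokenStream("field", "just an example");
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                System.out.println(term.toString());
            }
            ts.end();
            ts.close();
            analyzer.close();
        }
    }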
diff --git a/src/main/java/org/apache/lucene/analysis/ComboAnalyzerWrapper.java b/src/main/java/org/apache/lucene/analysis/ComboAnalyzerWrapper.java
index 7fe2400..725ce11 100644
--- a/src/main/java/org/apache/lucene/analysis/ComboAnalyzerWrapper.java
+++ b/src/main/java/org/apache/lucene/analysis/ComboAnalyzerWrapper.java
@@ -20,7 +20,6 @@ package org.apache.lucene.analysis;
 
 import org.apache.lucene.util.Version;
-import org.elasticsearch.ElasticsearchIllegalArgumentException;
 import org.elasticsearch.common.inject.Injector;
 import org.elasticsearch.common.logging.ESLogger;
 import org.elasticsearch.common.logging.ESLoggerFactory;
@@ -72,7 +71,7 @@ protected void init() {
         String[] sub = settings.getAsArray("sub_analyzers");
         ArrayList<Analyzer> subAnalyzers = new ArrayList<Analyzer>();
         if (sub == null) {
-            throw new ElasticsearchIllegalArgumentException("Analyzer ["+name+"] analyzer of type ["+NAME+"], must have a \"sub_analyzers\" list property");
+            throw new IllegalArgumentException("Analyzer ["+name+"] analyzer of type ["+NAME+"], must have a \"sub_analyzers\" list property");
         }
 
         for (String subname : sub) {
@@ -84,7 +83,7 @@ protected void init() {
             }
         }
 
-        this.analyzer = new org.apache.lucene.analysis.ComboAnalyzer(version, subAnalyzers.toArray(new Analyzer[subAnalyzers.size()]));
+        this.analyzer = new org.apache.lucene.analysis.ComboAnalyzer(subAnalyzers.toArray(new Analyzer[subAnalyzers.size()]));
 
         Boolean tokenstreamCaching = settings.getAsBoolean("tokenstream_caching", null);
         if (tokenstreamCaching != null)
@@ -96,9 +95,9 @@
     }
 
     @Override
-    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+    protected TokenStreamComponents createComponents(String fieldName) {
         if (analyzer == null) init();
-        return this.analyzer.createComponents(fieldName, reader);
+        return this.analyzer.createComponents(fieldName);
     }
 
     @Override public void close() {
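For reference, the wrapper still reads its configuration from the analyzer settings: the mandatory sub_analyzers list checked above, plus optional booleans such as tokenstream_caching. A sketch of how an index could declare a combo analyzer from the Java client; the analyzer name my_combo and the chosen sub-analyzers are made up for the example:

    import org.elasticsearch.common.settings.Settings;

    public class ComboSettingsExample {
        public static Settings comboAnalyzerSettings() {
            return Settings.builder()
                    .put("index.analysis.analyzer.my_combo.type", "combo")
                    .putArray("index.analysis.analyzer.my_combo.sub_analyzers", "standard", "whitespace")
                    .put("index.analysis.analyzer.my_combo.tokenstream_caching", true)
                    .build();
        }
    }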
diff --git a/src/main/java/org/apache/lucene/analysis/ReusableTokenStreamComponents.java b/src/main/java/org/apache/lucene/analysis/ReusableTokenStreamComponents.java
deleted file mode 100644
index 7ee38b4..0000000
--- a/src/main/java/org/apache/lucene/analysis/ReusableTokenStreamComponents.java
+++ /dev/null
@@ -1,68 +0,0 @@
-package org.apache.lucene.analysis;
-
-import java.io.IOException;
-import java.io.Reader;
-
-public class ReusableTokenStreamComponents extends Analyzer.TokenStreamComponents {
-
-    protected TokenStream sink;
-    protected final String fieldName;
-    protected final ComboAnalyzer analyzer;
-
-    public ReusableTokenStreamComponents(String fieldName, ComboAnalyzer analyzer) {
-        super(DummyTokenizer.INSTANCE);
-        this.fieldName = fieldName;
-        this.analyzer = analyzer;
-    }
-
-    public void setTokenStream(TokenStream sink) {
-        this.sink = sink;
-    }
-
-    @Override
-    protected void setReader(Reader reader) throws IOException {
-        // This ReusableTokenStreamComponents comes from a ReuseStrategy,
-        // which uses a ThreadLocal, hence the ComboAnalyzer will reuse
-        // this instance and make it ready.
-        analyzer.createComponents(fieldName, reader);
-    }
-
-    @Override
-    public TokenStream getTokenStream() {
-        return sink;
-    }
-
-    protected static final class DummyTokenizer extends Tokenizer {
-
-        public static final DummyTokenizer INSTANCE = new DummyTokenizer();
-
-        public DummyTokenizer() {
-            super(DummyReader.INSTANCE);
-        }
-
-        @Override
-        public boolean incrementToken() throws IOException {
-            return false;
-        }
-
-    }
-
-    protected static class DummyReader extends Reader {
-
-        public static final DummyReader INSTANCE = new DummyReader();
-
-        public DummyReader() {
-        }
-
-        @Override
-        public int read(char[] cbuf, int off, int len) throws IOException {
-            return 0;
-        }
-
-        @Override
-        public void close() throws IOException {
-        }
-
-    }
-
-}
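Deleting this class is safe because the base Analyzer has owned the reuse lifecycle since Lucene 4: with the default reuse strategy, createComponents(String) runs once per thread, and Lucene then feeds every new Reader to the cached components through setReader. A minimal sketch of that contract (WhitespaceTokenizer is an arbitrary stand-in):

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizer;

    public class MinimalAnalyzer extends Analyzer {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // Called only when no cached components exist for this thread;
            // later analyses just get a new Reader set on the same Tokenizer.
            Tokenizer source = new WhitespaceTokenizer();
            return new TokenStreamComponents(source);
        }
    }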
diff --git a/src/main/java/org/apache/lucene/util/ReaderCloneFactory.java b/src/main/java/org/apache/lucene/util/ReaderCloneFactory.java
index 5869ee6..1a1d6f5 100644
--- a/src/main/java/org/apache/lucene/util/ReaderCloneFactory.java
+++ b/src/main/java/org/apache/lucene/util/ReaderCloneFactory.java
@@ -17,11 +17,9 @@
 
 package org.apache.lucene.util;
 
-import org.apache.lucene.analysis.ReusableStringReaderCloner;
 import org.elasticsearch.common.logging.ESLogger;
 import org.elasticsearch.common.logging.ESLoggerFactory;
 
-import javax.io.StringReaderCloner;
 import java.io.BufferedReader;
 import java.io.CharArrayReader;
 import java.io.FilterReader;
@@ -43,7 +41,7 @@
  * that merely reads all the available content, and creates a String out of it.
  *
  * Therefore you should understand the importance of having a proper implementation for
- * any optimizable {@link Reader}. For instance, {@link javax.io.StringReaderCloner} gains access
+ * any optimizable {@link Reader}. For instance, {@link StringReaderCloner} gains access
  * to the underlying String in order to avoid copies. A generic BufferedReader
  */
 public class ReaderCloneFactory {
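The factory's calling convention is unchanged by the package move: hand it a Reader, get back a cloner that can replay the content once per sub-analyzer, as CombiningTokenStreamComponents does above. A sketch under that assumption, mirroring only the calls visible in this patch (getCloner and giveAClone):

    import java.io.Reader;
    import java.io.StringReader;
    import org.apache.lucene.util.ReaderCloneFactory;

    public class ClonerDemo {
        public static void main(String[] args) throws Exception {
            Reader original = new StringReader("same content, replayed");
            // Picks the most specific registered cloner for the concrete type,
            // falling back to the generic read-everything implementation.
            ReaderCloneFactory.ReaderCloner<Reader> cloner = ReaderCloneFactory.getCloner(original);
            Reader first = cloner.giveAClone();
            Reader second = cloner.giveAClone();
        }
    }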
diff --git a/src/main/java/org/apache/lucene/analysis/ReusableStringReaderCloner.java b/src/main/java/org/apache/lucene/util/ReusableStringReaderCloner.java
similarity index 81%
rename from src/main/java/org/apache/lucene/analysis/ReusableStringReaderCloner.java
rename to src/main/java/org/apache/lucene/util/ReusableStringReaderCloner.java
index e7dfaf0..c67afff 100644
--- a/src/main/java/org/apache/lucene/analysis/ReusableStringReaderCloner.java
+++ b/src/main/java/org/apache/lucene/util/ReusableStringReaderCloner.java
@@ -17,9 +17,8 @@
  * under the License.
  */
 
-package org.apache.lucene.analysis;
+package org.apache.lucene.util;
 
-import org.apache.lucene.util.ReaderCloneFactory;
 
 import java.io.IOException;
 import java.io.Reader;
@@ -35,16 +34,18 @@
  * private field {@code String s}, storing the original content.
  * It is therefore sensitive to Lucene implementation changes.
  */
-public class ReusableStringReaderCloner implements ReaderCloneFactory.ReaderCloner<ReusableStringReader> {
+public class ReusableStringReaderCloner implements ReaderCloneFactory.ReaderCloner<Reader> {
 
     private static java.lang.reflect.Field internalField;
+    private static Class reusableStringReader;
 
-    private ReusableStringReader original;
+    private Reader original;
     private String originalContent;
 
     static {
         try {
-            internalField = ReusableStringReader.class.getDeclaredField("s");
+            reusableStringReader = (Class) ReusableStringReaderCloner.class.getClassLoader().loadClass("org.apache.lucene.analysis.ReusableStringReader");
+            internalField = reusableStringReader.getDeclaredField("s");
             internalField.setAccessible(true);
         } catch (Exception ex) {
             throw new IllegalArgumentException("Could not give accessibility to private \"str\" field of the given StringReader", ex);
@@ -52,17 +53,17 @@ public class ReusableStringReaderCloner implements ReaderCloneFactory.ReaderClon
     }
 
     /**
-     * Binds this ReaderCloner with the package-private {@link ReusableStringReader} class
+     * Binds this ReaderCloner with the package-private ReusableStringReader class
      * into the {@link ReaderCloneFactory}, without giving access to the hidden class.
      */
     public static void registerCloner() {
-        ReaderCloneFactory.bindCloner(ReusableStringReader.class, ReusableStringReaderCloner.class);
+        ReaderCloneFactory.bindCloner(reusableStringReader, ReusableStringReaderCloner.class);
     }
 
     /**
     * @param originalReader Must pass the canHandleReader(Reader) test, otherwise an IllegalArgumentException will be thrown.
     */
-    public void init(ReusableStringReader originalReader) throws IOException {
+    public void init(Reader originalReader) throws IOException {
         this.original = originalReader;
         this.originalContent = null;
         try {
diff --git a/src/main/java/javax/io/StringReaderCloner.java b/src/main/java/org/apache/lucene/util/StringReaderCloner.java
similarity index 97%
rename from src/main/java/javax/io/StringReaderCloner.java
rename to src/main/java/org/apache/lucene/util/StringReaderCloner.java
index 1f9d0fa..a7e6a04 100644
--- a/src/main/java/javax/io/StringReaderCloner.java
+++ b/src/main/java/org/apache/lucene/util/StringReaderCloner.java
@@ -18,9 +18,7 @@
  */
 
 // Using javax instead of java because of JVM security measures!
-package javax.io;
-
-import org.apache.lucene.util.ReaderCloneFactory;
+package org.apache.lucene.util;
 
 import java.io.IOException;
 import java.io.Reader;
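The classloader lookup above replaces a compile-time reference: the plugin can no longer compile against Lucene's package-private ReusableStringReader, so both the class and its private field s are now resolved reflectively when the cloner class initializes. The same three-step pattern in isolation, demonstrated on java.lang.String's private value field (valid on the Java 7 this pom targets, and itself an implementation detail, which is exactly the caveat the class javadoc gives):

    public class ReflectionSketch {
        public static void main(String[] args) throws Exception {
            // 1. load the class by name, 2. grab the private field,
            // 3. disable the access check before reading it.
            Class<?> target = ReflectionSketch.class.getClassLoader()
                    .loadClass("java.lang.String");
            java.lang.reflect.Field f = target.getDeclaredField("value");
            f.setAccessible(true);
            char[] chars = (char[]) f.get("hello");
            System.out.println(chars.length); // prints 5
        }
    }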
diff --git a/src/main/java/org/elasticsearch/index/analysis/ComboAnalysisBinderProcessor.java b/src/main/java/org/elasticsearch/index/analysis/ComboAnalysisBinderProcessor.java
deleted file mode 100644
index 7859f60..0000000
--- a/src/main/java/org/elasticsearch/index/analysis/ComboAnalysisBinderProcessor.java
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * Licensed to Elastic Search and Shay Banon under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. Elastic Search licenses this
- * file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.elasticsearch.index.analysis;
-
-import org.apache.lucene.analysis.ComboAnalyzerWrapper;
-
-public class ComboAnalysisBinderProcessor extends AnalysisModule.AnalysisBinderProcessor {
-
-    @Override public void processAnalyzers(AnalyzersBindings analyzersBindings) {
-        analyzersBindings.processAnalyzer(ComboAnalyzerWrapper.NAME, ComboAnalyzerProvider.class);
-    }
-
-}
diff --git a/src/main/java/org/elasticsearch/index/analysis/ComboAnalyzerProvider.java b/src/main/java/org/elasticsearch/index/analysis/ComboAnalyzerProvider.java
index 927946d..5d8e9ff 100644
--- a/src/main/java/org/elasticsearch/index/analysis/ComboAnalyzerProvider.java
+++ b/src/main/java/org/elasticsearch/index/analysis/ComboAnalyzerProvider.java
@@ -24,8 +24,9 @@
 import org.elasticsearch.common.inject.Injector;
 import org.elasticsearch.common.inject.assistedinject.Assisted;
 import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.env.Environment;
 import org.elasticsearch.index.Index;
-import org.elasticsearch.index.settings.IndexSettings;
+import org.elasticsearch.index.settings.IndexSettingsService;
 
 public class ComboAnalyzerProvider extends AbstractIndexAnalyzerProvider<ComboAnalyzerWrapper> {
 
@@ -33,8 +34,9 @@ public class ComboAnalyzerProvider extends AbstractIndexAnalyzerProvider<ComboAnalyzerWrapper>
 
+    protected Collection<Class<? extends Plugin>> nodePlugins() {
+        return pluginList(AnalysisComboPlugin.class);
+    }
+
+    protected Settings nodeSettings(int nodeOrdinal) {
+        Settings.Builder settings = Settings.builder()
+                .put(super.nodeSettings(nodeOrdinal));
+        return settings.build();
+    }
+
-    protected void assertAnalyzesTo(String analyzer, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[]) {
+    protected void assertAnalyzesTo(String analyzer, String input, String[] output, int startOffsets[], int endOffsets[], String types[], int position[]) {
         assertThat(output, notNullValue());
-        AnalyzeResponse response = client().admin().indices().analyze(new AnalyzeRequest(INDEX, input).analyzer(analyzer)).actionGet();
+        AnalyzeResponse response = client().admin().indices().analyze(new AnalyzeRequest(INDEX).text(input).analyzer(analyzer)).actionGet();
         if (VERBOSE) {
             try {
                 Map params = new HashMap();
@@ -36,7 +51,6 @@ protected void assertAnalyzesTo(String analyzer, String input, String[] output,
             }
         }
         Iterator<AnalyzeResponse.AnalyzeToken> tokens = response.iterator();
-        int pos = 0;
         for (int i = 0; i < output.length; i++) {
             assertTrue("token "+i+" does not exist", tokens.hasNext());
             AnalyzeResponse.AnalyzeToken token = tokens.next();
@@ -47,9 +61,8 @@ protected void assertAnalyzesTo(String analyzer, String input, String[] output,
             assertThat("endOffset "+i, token.getEndOffset(), equalTo(endOffsets[i]));
             if (types != null)
                 assertThat("type "+i, token.getType(), equalTo(types[i]));
-            if (posIncrements != null) {
-                pos += posIncrements[i];
-                assertThat("position "+i, token.getPosition(), equalTo(pos));
+            if (position != null) {
+                assertThat("position "+i, token.getPosition(), equalTo(position[i]));
             }
         }
     }
@@ -84,7 +97,7 @@ public void testAnalysis() throws IOException {
                 new int[]{ 0, 0, 0, 5, 7, 7, 14, 14},
                 new int[]{ 4, 4, 18, 6, 13, 13, 18, 18},
                 null,
-                new int[]{ 1, 0, 0, 1, 1, 0, 1, 0});
+                new int[]{ 0, 0, 0, 1, 2, 2, 3, 3});
     }
 }
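The last hunk reflects a behavioral change in the ES 2.x analyze API rather than in the plugin: tokens now report absolute, 0-based positions instead of position increments, so the expected array changes form, not meaning. The correspondence is a running sum, as this small self-check shows:

    public class PositionsDemo {
        public static void main(String[] args) {
            // Old expectations were increments; new ones are absolute positions.
            int[] increments = { 1, 0, 0, 1, 1, 0, 1, 0 };
            int[] positions = new int[increments.length];
            int pos = -1; // so the leading increment of 1 yields position 0
            for (int i = 0; i < increments.length; i++) {
                pos += increments[i];
                positions[i] = pos;
            }
            // positions is now { 0, 0, 0, 1, 2, 2, 3, 3 }, the new expected array
            System.out.println(java.util.Arrays.toString(positions));
        }
    }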