commit fccbb79c1ae504089a5158749d45268fd356a023
parent 540cd77a71ff85c8d813407171b9525e8fa8994b
Author: Dan Callaghan <djc@djc.id.au>
Date: Sat, 19 Mar 2011 15:48:42 +1000
upgrade to Lucene 3.0
Diffstat:
9 files changed, 317 insertions(+), 297 deletions(-)
diff --git a/pom.xml b/pom.xml
@@ -65,17 +65,17 @@
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
- <version>4.8.1</version>
+ <version>4.8.2</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
- <version>2.4.0</version>
+ <version>3.0.2</version>
</dependency>
<dependency>
<groupId>org.easymock</groupId>
- <artifactId>easymockclassextension</artifactId>
+ <artifactId>easymock</artifactId>
<version>2.5.2</version>
</dependency>
</dependencies>
diff --git a/src/main/java/au/com/miskinhill/search/analysis/Analyzer.java b/src/main/java/au/com/miskinhill/search/analysis/Analyzer.java
@@ -0,0 +1,20 @@
+package au.com.miskinhill.search.analysis;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Same as {@link org.apache.lucene.analysis.Analyzer Lucene's Analyzer} but
+ * with a saner API.
+ */
+public interface Analyzer {
+
+ TokenStream tokenizer(Reader input);
+
+ TokenStream tokenizer(AttributeSource attributeSource, Reader input);
+
+ TokenStream applyFilters(TokenStream input);
+
+}
diff --git a/src/main/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilter.java b/src/main/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilter.java
@@ -1,46 +1,57 @@
package au.com.miskinhill.search.analysis;
import java.io.IOException;
+import java.nio.CharBuffer;
import java.util.HashMap;
import java.util.Map;
+import java.util.regex.Pattern;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
/**
* Assumes that tokens have already been lower-cased.
*/
public class CyrillicTransliteratingFilter extends TokenFilter {
- private static final String CYRILLIC_PATTERN = ".*[а-я]+.*";
-
- private Token transliterated = null;
+ private static final Pattern CYRILLIC_PATTERN = Pattern.compile("[а-я]+");
+
+ private final TermAttribute termAttribute;
+ private final PositionIncrementAttribute posIncAttribute;
+ private String transliterated = null;
+ private State transliteratedState = null;
protected CyrillicTransliteratingFilter(TokenStream input) {
super(input);
+ this.termAttribute = addAttribute(TermAttribute.class);
+ this.posIncAttribute = addAttribute(PositionIncrementAttribute.class);
}
@Override
- public Token next(Token reusableToken) throws IOException {
- Token tok;
+ public boolean incrementToken() throws IOException {
if (transliterated == null) {
- tok = input.next(reusableToken);
- if (tok == null) return null;
- if (needsTransliterating(tok.term())) {
- transliterated = (Token) tok.clone();
- transliterated.setTermBuffer(transliterate(transliterated.term()));
- transliterated.setPositionIncrement(0);
+ if (!input.incrementToken())
+ return false;
+ CharSequence text = CharBuffer.wrap(termAttribute.termBuffer(),
+ 0, termAttribute.termLength());
+ if (needsTransliterating(text)) {
+ transliterated = transliterate(text);
+ transliteratedState = captureState();
}
} else {
- tok = transliterated;
+ restoreState(transliteratedState);
+ termAttribute.setTermBuffer(transliterated);
+ posIncAttribute.setPositionIncrement(0);
transliterated = null;
+ transliteratedState = null;
}
- return tok;
+ return true;
}
- private static boolean needsTransliterating(String text) {
- return (text.matches(CYRILLIC_PATTERN));
+ private static boolean needsTransliterating(CharSequence text) {
+ return (CYRILLIC_PATTERN.matcher(text).find());
}
private static final Map<Character, String> TRANSLITERATION_TABLE = new HashMap<Character, String>();
diff --git a/src/main/java/au/com/miskinhill/search/analysis/OffsetTokenFilter.java b/src/main/java/au/com/miskinhill/search/analysis/OffsetTokenFilter.java
@@ -2,27 +2,32 @@ package au.com.miskinhill.search.analysis;
import java.io.IOException;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
public class OffsetTokenFilter extends TokenFilter {
-
+
+ private final OffsetAttribute offsetAttribute;
private int offset;
protected OffsetTokenFilter(TokenStream input, int offset) {
super(input);
this.offset = offset;
+ this.offsetAttribute = addAttribute(OffsetAttribute.class);
}
- @Override
- public Token next(Token reusableToken) throws IOException {
- Token retval = input.next(reusableToken);
- if (retval != null && offset != 0) {
- retval.setStartOffset(retval.startOffset() + offset);
- retval.setEndOffset(retval.endOffset() + offset);
- }
- return retval;
- }
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (input.incrementToken()) {
+ if (offset != 0) {
+ offsetAttribute.setOffset(offsetAttribute.startOffset() + offset,
+ offsetAttribute.endOffset() + offset);
+ }
+ return true;
+ } else {
+ return false;
+ }
+ }
}
diff --git a/src/main/java/au/com/miskinhill/search/analysis/PerLanguageAnalyzerMap.java b/src/main/java/au/com/miskinhill/search/analysis/PerLanguageAnalyzerMap.java
@@ -0,0 +1,56 @@
+package au.com.miskinhill.search.analysis;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+/**
+ * Returns an analyzer based on the language of the text
+ * being analysed. The default sub-analyzer is given in the constructor; this is
+ * used when the language is not specified, or when a language is specified for
+ * which we have no specific sub-analyzer. Use
+ * {@link #addAnalyzer(String, Analyzer)} to add a sub-analyzer for a specific
+ * language.
+ * <p>
+ * Note that languages are matched by prefix, so that if a sub-analyzer has been
+ * added for "en" (but not "en-AU"), it will be returned for "en-AU".
+ */
+public class PerLanguageAnalyzerMap {
+
+ private static final Logger LOG = Logger.getLogger(PerLanguageAnalyzerMap.class.getName());
+
+ protected Trie<Analyzer> analyzers;
+ private List<Analyzer> analyzersList = new ArrayList<Analyzer>(); // easier than traversing the trie
+
+ public PerLanguageAnalyzerMap(Analyzer defaultAnalyzer) {
+ analyzers = new Trie<Analyzer>(defaultAnalyzer);
+ analyzersList.add(defaultAnalyzer);
+ }
+
+ public void addAnalyzer(String language, Analyzer analyzer) {
+ analyzers.put(language, analyzer);
+ analyzersList.add(analyzer);
+ }
+
+ /**
+ * Returns a list of all sub-analyzers in this analyzer (including the default one).
+ */
+ public List<Analyzer> getAnalyzers() {
+ return analyzersList;
+ }
+
+ /**
+ * Returns an appropriate analyzer for the given language.
+ *
+ * @param language ISO-639 language identifier
+ */
+ // XXX TODO use java.util.Locale eventually (maybe with Locale#forLanguageTag added in 1.7?)
+ public Analyzer getAnalyzer(String language) {
+ if (language == null) language = "";
+ Analyzer a = analyzers.get(language);
+ if (a == analyzersList.get(0))
+ LOG.warning("Using default analyzer for language " + language);
+ return a;
+ }
+
+}
diff --git a/src/main/java/au/com/miskinhill/search/analysis/PerLanguageAnalyzerWrapper.java b/src/main/java/au/com/miskinhill/search/analysis/PerLanguageAnalyzerWrapper.java
@@ -1,63 +0,0 @@
-package au.com.miskinhill.search.analysis;
-
-import java.io.Reader;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.logging.Logger;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-
-/**
- * In the same vein as
- * {@link org.apache.lucene.analysis.PerFieldAnalyzerWrapper}, this analyzer
- * delegates to a sub-analyzer according to based on the language of the text
- * being analysed. The default sub-analyzer is given in the constructor; this is
- * used when the language is not specified, or when a language is specified for
- * which we have no specific sub-analyzer. Use
- * {@link #addAnalyzer(String, Analyzer)} to add a sub-analyzer for a specific
- * language.
- * <p>
- * Note that languages are matched by prefix, so that if a sub-analyzer has been
- * added for "en" (but not "en-AU"), it will be selected when analysing text
- * whose language is given as "en-AU".
- */
-public class PerLanguageAnalyzerWrapper extends Analyzer {
-
- private static final Logger LOG = Logger.getLogger(PerLanguageAnalyzerWrapper.class.getName());
-
- protected Trie<Analyzer> analyzers;
- private List<Analyzer> analyzersList = new ArrayList<Analyzer>(); // easier than traversing the trie
-
- public PerLanguageAnalyzerWrapper(Analyzer defaultAnalyzer) {
- analyzers = new Trie<Analyzer>(defaultAnalyzer);
- analyzersList.add(defaultAnalyzer);
- }
-
- public void addAnalyzer(String language, Analyzer analyzer) {
- analyzers.put(language, analyzer);
- analyzersList.add(analyzer);
- }
-
- /**
- * Returns a list of all sub-analyzers in this analyzer (including the default one).
- */
- public List<Analyzer> getAnalyzers() {
- return analyzersList;
- }
-
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- LOG.warning("Using default analyzer");
- return tokenStream("", fieldName, reader);
- }
-
- public TokenStream tokenStream(String language, String fieldName, Reader reader) {
- if (language == null) language = "";
- Analyzer a = analyzers.get(language);
- if (a == analyzersList.get(0))
- LOG.warning("Using default analyzer for language " + language);
- return a.tokenStream(fieldName, reader);
- }
-
-}
diff --git a/src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java b/src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java
@@ -17,123 +17,122 @@ import javax.xml.stream.events.Characters;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
-import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
public class XMLTokenizer extends TokenStream {
-
- private static final XMLInputFactory factory = XMLInputFactory.newInstance();
- static {
- factory.setProperty("javax.xml.stream.isCoalescing", true);
- }
+
+ private static final XMLInputFactory factory = XMLInputFactory.newInstance();
+ static {
+ factory.setProperty("javax.xml.stream.isCoalescing", true);
+ }
public static XMLInputFactory getXMLInputFactory() {
return factory;
}
-
- private static final String XHTML_NS_URI = "http://www.w3.org/1999/xhtml";
-
- private static class LangStack extends Stack<String> {
- private static final long serialVersionUID = 7020093255092191463L;
- private String current = null;
- @Override
- public String push(String item) {
- if (item != null)
- current = item;
- super.push(current);
- return item;
- }
- @Override
- public synchronized String pop() {
- String top = super.pop();
- current = empty() ? null : peek();
- return top;
- }
- public String getCurrent() {
- return current;
- }
- }
-
- private XMLEventReader r;
- private PerLanguageAnalyzerWrapper analyzer;
- private LangStack langs = new LangStack();
-
- /** Current delegate in use (null if none currently) */
- private TokenStream delegate = null;
-
- public XMLTokenizer(Reader reader, PerLanguageAnalyzerWrapper analyzer)
- throws XMLStreamException {
- this.analyzer = analyzer;
- r = factory.createXMLEventReader(reader);
- }
-
- public XMLTokenizer(InputStream in, PerLanguageAnalyzerWrapper analyzer)
- throws XMLStreamException {
- this.analyzer = analyzer;
- r = factory.createXMLEventReader(in);
- }
-
- @Override
- public Token next(Token reusableToken) throws IOException {
- // first try our current string delegate, if we have one
- if (delegate != null) {
- Token retval = delegate.next(reusableToken);
- if (retval != null)
- return retval;
- else
- delegate = null;
- }
-
- while (r.hasNext()) {
- XMLEvent event;
- try {
- event = r.nextEvent();
- } catch (XMLStreamException e) {
- throw new IOException(e);
- }
- switch (event.getEventType()) {
- case XMLStreamConstants.START_ELEMENT:
- StartElement se = event.asStartElement();
- langs.push(getLang(se));
- break;
- case XMLStreamConstants.CHARACTERS:
- Characters chars = event.asCharacters();
- if (chars.isWhiteSpace())
- break; // don't care
- delegate = new OffsetTokenFilter(
- analyzer.tokenStream(langs.getCurrent(),
- null, new StringReader(chars.getData())),
- event.getLocation().getCharacterOffset());
- Token retval = delegate.next(reusableToken);
- if (retval != null)
- return retval;
- else
- delegate = null;
- break;
- case XMLStreamConstants.END_ELEMENT:
- langs.pop();
- break;
- }
- }
- return null;
- }
-
- private String getLang(StartElement se) {
- // xml:lang takes precedence
- QName xmlLangQName = new QName(
- se.getNamespaceURI("") == XMLConstants.XML_NS_URI ? "" : XMLConstants.XML_NS_URI,
- "lang");
- Attribute xmlLang = se.getAttributeByName(xmlLangQName);
- if (xmlLang != null)
- return xmlLang.getValue();
-
- QName xhtmlLangQName = new QName(
- se.getNamespaceURI("") == XHTML_NS_URI ? "" : XHTML_NS_URI,
- "lang");
- Attribute xhtmlLang = se.getAttributeByName(xhtmlLangQName);
- if (xhtmlLang != null)
- return xhtmlLang.getValue();
-
- return null;
- }
-
+
+ private static final String XHTML_NS_URI = "http://www.w3.org/1999/xhtml";
+
+ private static class LangStack extends Stack<String> {
+ private static final long serialVersionUID = 7020093255092191463L;
+ private String current = null;
+
+ public LangStack() {
+ }
+
+ @Override
+ public String push(String item) {
+ if (item != null)
+ current = item;
+ super.push(current);
+ return item;
+ }
+
+ @Override
+ public synchronized String pop() {
+ String top = super.pop();
+ current = empty() ? null : peek();
+ return top;
+ }
+
+ public String getCurrent() {
+ return current;
+ }
+ }
+
+ private XMLEventReader r;
+ private PerLanguageAnalyzerMap analyzerMap;
+ private LangStack langs = new LangStack();
+
+ /** Current delegate in use (null if none currently) */
+ private TokenStream delegate = null;
+
+ public XMLTokenizer(Reader reader, PerLanguageAnalyzerMap analyzerMap) throws XMLStreamException {
+ this.analyzerMap = analyzerMap;
+ r = factory.createXMLEventReader(reader);
+ }
+
+ public XMLTokenizer(InputStream in, PerLanguageAnalyzerMap analyzerMap) throws XMLStreamException {
+ this.analyzerMap = analyzerMap;
+ r = factory.createXMLEventReader(in);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ clearAttributes();
+
+ // first try our current string delegate, if we have one
+ if (delegate != null) {
+ if (delegate.incrementToken())
+ return true;
+ else
+ delegate = null;
+ }
+
+ while (r.hasNext()) {
+ XMLEvent event;
+ try {
+ event = r.nextEvent();
+ } catch (XMLStreamException e) {
+ throw new IOException(e);
+ }
+ switch (event.getEventType()) {
+ case XMLStreamConstants.START_ELEMENT:
+ StartElement se = event.asStartElement();
+ langs.push(getLang(se));
+ break;
+ case XMLStreamConstants.CHARACTERS:
+ Characters chars = event.asCharacters();
+ if (chars.isWhiteSpace())
+ break; // don't care
+ Analyzer analyzer = analyzerMap.getAnalyzer(langs.getCurrent());
+ delegate = new OffsetTokenFilter(analyzer.applyFilters(analyzer.tokenizer(this, new StringReader(
+ chars.getData()))), event.getLocation().getCharacterOffset());
+ if (delegate.incrementToken())
+ return true;
+ else
+ delegate = null;
+ break;
+ case XMLStreamConstants.END_ELEMENT:
+ langs.pop();
+ break;
+ }
+ }
+ return false;
+ }
+
+ private String getLang(StartElement se) {
+ // xml:lang takes precedence
+ QName xmlLangQName = new QName(
+ se.getNamespaceURI("") == XMLConstants.XML_NS_URI ? "" : XMLConstants.XML_NS_URI, "lang");
+ Attribute xmlLang = se.getAttributeByName(xmlLangQName);
+ if (xmlLang != null)
+ return xmlLang.getValue();
+
+ QName xhtmlLangQName = new QName(se.getNamespaceURI("") == XHTML_NS_URI ? "" : XHTML_NS_URI, "lang");
+ Attribute xhtmlLang = se.getAttributeByName(xhtmlLangQName);
+ if (xhtmlLang != null)
+ return xhtmlLang.getValue();
+
+ return null;
+ }
+
}
diff --git a/src/test/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilterUnitTest.java b/src/test/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilterUnitTest.java
@@ -1,19 +1,49 @@
package au.com.miskinhill.search.analysis;
-import static org.junit.Assert.assertThat;
-import static org.hamcrest.CoreMatchers.equalTo;
+import static org.hamcrest.CoreMatchers.*;
+import static org.junit.Assert.*;
import java.io.IOException;
-import java.util.ArrayList;
import java.util.Arrays;
-import java.util.Iterator;
-import java.util.List;
+import java.util.LinkedList;
+import java.util.Queue;
import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;
+import org.apache.lucene.util.AttributeSource;
import org.junit.Test;
public class CyrillicTransliteratingFilterUnitTest {
+
+ private static final class FakeTokenStream extends TokenStream {
+ private final TermAttribute termAttribute;
+ private final OffsetAttribute offsetAttribute;
+ private final PositionIncrementAttribute posIncAttribute;
+ private final Queue<Token> tokens;
+
+ public FakeTokenStream(Token... tokens) {
+ this.tokens = new LinkedList<Token>(Arrays.asList(tokens));
+ this.termAttribute = addAttribute(TermAttribute.class);
+ this.offsetAttribute = addAttribute(OffsetAttribute.class);
+ this.posIncAttribute = addAttribute(PositionIncrementAttribute.class);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ if (tokens.isEmpty())
+ return false;
+ clearAttributes();
+ Token next = tokens.remove();
+ termAttribute.setTermBuffer(next.term());
+ offsetAttribute.setOffset(next.startOffset(), next.endOffset());
+ posIncAttribute.setPositionIncrement(next.getPositionIncrement());
+ return true;
+ }
+ }
@Test
public void shouldPassOnTokensWithoutCyrillicUntouched() throws IOException {
@@ -21,8 +51,11 @@ public class CyrillicTransliteratingFilterUnitTest {
asdf.setTermBuffer("asdf");
asdf.setStartOffset(1);
asdf.setEndOffset(4);
- assertThat(filter(Arrays.asList(asdf)),
- equalTo(Arrays.asList(asdf)));
+ TokenFilter filter = new CyrillicTransliteratingFilter(
+ new FakeTokenStream(asdf));
+ assertTrue(filter.incrementToken());
+ assertAttributes(filter, "asdf", 1, 4, 1);
+ assertFalse(filter.incrementToken());
}
@Test
@@ -31,13 +64,13 @@ public class CyrillicTransliteratingFilterUnitTest {
igraCyrillic.setTermBuffer("игра");
igraCyrillic.setStartOffset(1);
igraCyrillic.setEndOffset(4);
- Token igraLatin = new Token();
- igraLatin.setTermBuffer("igra");
- igraLatin.setStartOffset(1);
- igraLatin.setEndOffset(4);
- igraLatin.setPositionIncrement(0);
- assertThat(filter(Arrays.asList(igraCyrillic)),
- equalTo(Arrays.asList(igraCyrillic, igraLatin)));
+ TokenFilter filter = new CyrillicTransliteratingFilter(
+ new FakeTokenStream(igraCyrillic));
+ assertTrue(filter.incrementToken());
+ assertAttributes(filter, "игра", 1, 4, 1);
+ assertTrue(filter.incrementToken());
+ assertAttributes(filter, "igra", 1, 4, 0);
+ assertFalse(filter.incrementToken());
}
@Test
@@ -45,33 +78,26 @@ public class CyrillicTransliteratingFilterUnitTest {
Token mixed = new Token();
mixed.setTermBuffer("interнет");
mixed.setStartOffset(1);
- mixed.setEndOffset(4);
- Token latin = new Token();
- latin.setTermBuffer("internet");
- latin.setStartOffset(1);
- latin.setEndOffset(4);
- latin.setPositionIncrement(0);
- assertThat(filter(Arrays.asList(mixed)),
- equalTo(Arrays.asList(mixed, latin)));
+ mixed.setEndOffset(8);
+ TokenFilter filter = new CyrillicTransliteratingFilter(
+ new FakeTokenStream(mixed));
+ assertTrue(filter.incrementToken());
+ assertAttributes(filter, "interнет", 1, 8, 1);
+ assertTrue(filter.incrementToken());
+ assertAttributes(filter, "internet", 1, 8, 0);
+ assertFalse(filter.incrementToken());
}
- private List<Token> filter(List<Token> input) throws IOException {
- final Iterator<Token> inputIt = input.iterator();
- TokenStream inputStream = new TokenStream() {
- @Override
- public Token next(Token reusableToken) throws IOException {
- if (!inputIt.hasNext()) return null;
- else return inputIt.next();
- }
- };
- CyrillicTransliteratingFilter filter = new CyrillicTransliteratingFilter(inputStream);
- List<Token> output = new ArrayList<Token>();
- while (true) {
- Token next = filter.next(new Token());
- if (next == null) break;
- output.add(next);
- }
- return output;
+ private void assertAttributes(AttributeSource source, String term,
+ int start, int end, int posInc) {
+ assertThat(source.getAttribute(TermAttribute.class).term(),
+ equalTo(term));
+ assertThat(source.getAttribute(OffsetAttribute.class).startOffset(),
+ equalTo(start));
+ assertThat(source.getAttribute(OffsetAttribute.class).endOffset(),
+ equalTo(end));
+ assertThat(source.getAttribute(PositionIncrementAttribute.class)
+ .getPositionIncrement(), equalTo(posInc));
}
}
diff --git a/src/test/java/au/com/miskinhill/search/analysis/PerLanguageAnalyzerWrapperUnitTest.java b/src/test/java/au/com/miskinhill/search/analysis/PerLanguageAnalyzerWrapperUnitTest.java
@@ -1,13 +1,12 @@
package au.com.miskinhill.search.analysis;
-import static org.easymock.classextension.EasyMock.*;
-import static org.junit.Assert.assertThat;
-import static org.junit.matchers.JUnitMatchers.hasItems;
+import static org.easymock.EasyMock.*;
+import static org.hamcrest.CoreMatchers.*;
+import static org.junit.Assert.*;
-import java.io.Reader;
-import java.io.StringReader;
+import java.util.Arrays;
-import org.apache.lucene.analysis.Analyzer;
+import org.junit.Before;
import org.junit.Test;
public class PerLanguageAnalyzerWrapperUnitTest {
@@ -15,67 +14,34 @@ public class PerLanguageAnalyzerWrapperUnitTest {
private Analyzer defaultAnalyzer = createMock(Analyzer.class);
private Analyzer enAnalyzer = createMock(Analyzer.class);
private Analyzer ruAnalyzer = createMock(Analyzer.class);
+ private PerLanguageAnalyzerMap plam;
+
+ @Before
+ public void setUp() {
+ plam = new PerLanguageAnalyzerMap(defaultAnalyzer);
+ plam.addAnalyzer("en", enAnalyzer);
+ plam.addAnalyzer("ru", ruAnalyzer);
+ }
@Test
public void testGetAnalyzers() {
- PerLanguageAnalyzerWrapper plaw =
- new PerLanguageAnalyzerWrapper(defaultAnalyzer);
- plaw.addAnalyzer("en", enAnalyzer);
- plaw.addAnalyzer("ru", ruAnalyzer);
- assertThat(plaw.getAnalyzers(),
- hasItems(defaultAnalyzer, enAnalyzer, ruAnalyzer));
- }
-
- @Test
- public void testTokenStreamNoLanguage() {
- expect(defaultAnalyzer.tokenStream(
- isA(String.class), isA(Reader.class))).andReturn(null);
- replay(defaultAnalyzer, enAnalyzer, ruAnalyzer);
- PerLanguageAnalyzerWrapper plaw =
- new PerLanguageAnalyzerWrapper(defaultAnalyzer);
- plaw.addAnalyzer("en", enAnalyzer);
- plaw.addAnalyzer("ru", ruAnalyzer);
- plaw.tokenStream("asdf", new StringReader(""));
- verify();
+ assertThat(plam.getAnalyzers(),
+ equalTo(Arrays.asList(defaultAnalyzer, enAnalyzer, ruAnalyzer)));
}
@Test
public void testTokenStreamEmptyLanguage() {
- expect(defaultAnalyzer.tokenStream(
- isA(String.class), isA(Reader.class))).andReturn(null);
- replay(defaultAnalyzer, enAnalyzer, ruAnalyzer);
- PerLanguageAnalyzerWrapper plaw =
- new PerLanguageAnalyzerWrapper(defaultAnalyzer);
- plaw.addAnalyzer("en", enAnalyzer);
- plaw.addAnalyzer("ru", ruAnalyzer);
- plaw.tokenStream("", "asdf", new StringReader(""));
- verify();
+ assertThat(plam.getAnalyzer(""), equalTo(defaultAnalyzer));
}
@Test
public void testTokenStreamNullLanguage() {
- expect(defaultAnalyzer.tokenStream(
- isA(String.class), isA(Reader.class))).andReturn(null);
- replay(defaultAnalyzer, enAnalyzer, ruAnalyzer);
- PerLanguageAnalyzerWrapper plaw =
- new PerLanguageAnalyzerWrapper(defaultAnalyzer);
- plaw.addAnalyzer("en", enAnalyzer);
- plaw.addAnalyzer("ru", ruAnalyzer);
- plaw.tokenStream(null, "asdf", new StringReader(""));
- verify();
+ assertThat(plam.getAnalyzer(null), equalTo(defaultAnalyzer));
}
@Test
public void testTokenStreamSomeLanguage() {
- expect(enAnalyzer.tokenStream(
- isA(String.class), isA(Reader.class))).andReturn(null);
- replay(defaultAnalyzer, enAnalyzer, ruAnalyzer);
- PerLanguageAnalyzerWrapper plaw =
- new PerLanguageAnalyzerWrapper(defaultAnalyzer);
- plaw.addAnalyzer("en", enAnalyzer);
- plaw.addAnalyzer("ru", ruAnalyzer);
- plaw.tokenStream("en", "asdf", new StringReader(""));
- verify();
+ assertThat(plam.getAnalyzer("en"), equalTo(enAnalyzer));
}
}