lucene-multilingual

Multilingual enhancements for the Lucene text search library
git clone https://code.djc.id.au/git/lucene-multilingual/
commit 044f35a6abdd370d353afd20b26b91f9697357ab
parent ad54c04938acef563f3ee49130055cc3a6485660
Author: Dan Callaghan <djc@djc.id.au>
Date:   Wed, 25 Apr 2012 13:51:59 +1000

updated Lucene and other deps

Diffstat:
M pom.xml | 9 +++++----
M src/main/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilter.java | 18 ++++++++----------
M src/main/java/au/com/miskinhill/search/analysis/OffsetTokenFilter.java | 2 +-
M src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java | 2 +-
M src/test/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilterUnitTest.java | 19 ++++++++++---------
5 files changed, 25 insertions(+), 25 deletions(-)
diff --git a/pom.xml b/pom.xml
@@ -29,7 +29,7 @@
             <extension>
                 <groupId>org.apache.maven.wagon</groupId>
                 <artifactId>wagon-webdav-jackrabbit</artifactId>
-                <version>1.0-beta-7</version>
+                <version>2.2</version>
             </extension>
         </extensions>
         <pluginManagement>
@@ -72,18 +72,19 @@
   	<dependency>
   		<groupId>junit</groupId>
   		<artifactId>junit</artifactId>
-        <version>4.8.2</version>
+        <version>4.10</version>
   		<scope>test</scope>
   	</dependency>
   	<dependency>
   		<groupId>org.apache.lucene</groupId>
   		<artifactId>lucene-core</artifactId>
-        <version>3.0.2</version>
+        <version>3.6.0</version>
   	</dependency>
   	<dependency>
   		<groupId>org.easymock</groupId>
   		<artifactId>easymock</artifactId>
-        <version>2.5.2</version>
+        <version>3.1</version>
+        <scope>test</scope>
   	</dependency>
   </dependencies>
 </project>
diff --git a/src/main/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilter.java b/src/main/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilter.java
@@ -1,15 +1,14 @@
 package au.com.miskinhill.search.analysis;
 
 import java.io.IOException;
-import java.nio.CharBuffer;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.regex.Pattern;
 
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 
 /**
  * Assumes that tokens have already been lower-cased.
@@ -18,31 +17,30 @@ public class CyrillicTransliteratingFilter extends TokenFilter {
     
     private static final Pattern CYRILLIC_PATTERN = Pattern.compile("[а-я]+");
 
-    private final TermAttribute termAttribute;
+    private final CharTermAttribute termAttribute;
     private final PositionIncrementAttribute posIncAttribute;
     private String transliterated = null;
     private State transliteratedState = null;
     
     protected CyrillicTransliteratingFilter(TokenStream input) {
         super(input);
-        this.termAttribute = addAttribute(TermAttribute.class);
+        this.termAttribute = addAttribute(CharTermAttribute.class);
         this.posIncAttribute = addAttribute(PositionIncrementAttribute.class);
     }
     
     @Override
-    public boolean incrementToken() throws IOException {
+    public final boolean incrementToken() throws IOException {
         if (transliterated == null) {
             if (!input.incrementToken())
                 return false;
-            CharSequence text = CharBuffer.wrap(termAttribute.termBuffer(),
-                    0, termAttribute.termLength());
-            if (needsTransliterating(text)) {
-                transliterated = transliterate(text);
+            if (needsTransliterating(termAttribute)) {
+                transliterated = transliterate(termAttribute);
                 transliteratedState = captureState();
             }
         } else {
             restoreState(transliteratedState);
-            termAttribute.setTermBuffer(transliterated);
+            termAttribute.setEmpty();
+            termAttribute.append(transliterated);
             posIncAttribute.setPositionIncrement(0);
             transliterated = null;
             transliteratedState = null;
diff --git a/src/main/java/au/com/miskinhill/search/analysis/OffsetTokenFilter.java b/src/main/java/au/com/miskinhill/search/analysis/OffsetTokenFilter.java
@@ -18,7 +18,7 @@ public class OffsetTokenFilter extends TokenFilter {
 	}
 	
     @Override
-    public boolean incrementToken() throws IOException {
+    public final boolean incrementToken() throws IOException {
         if (input.incrementToken()) {
             if (offset != 0) {
                 offsetAttribute.setOffset(offsetAttribute.startOffset() + offset,
diff --git a/src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java b/src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java
@@ -76,7 +76,7 @@ public class XMLTokenizer extends TokenStream {
     }
 
     @Override
-    public boolean incrementToken() throws IOException {
+    public final boolean incrementToken() throws IOException {
         clearAttributes();
 
         // first try our current string delegate, if we have one
diff --git a/src/test/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilterUnitTest.java b/src/test/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilterUnitTest.java
@@ -11,34 +11,35 @@ import java.util.Queue;
 import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.util.AttributeSource;
 import org.junit.Test;
 
 public class CyrillicTransliteratingFilterUnitTest {
     
     private static final class FakeTokenStream extends TokenStream {
-        private final TermAttribute termAttribute;
+        private final CharTermAttribute termAttribute;
         private final OffsetAttribute offsetAttribute;
         private final PositionIncrementAttribute posIncAttribute;
         private final Queue<Token> tokens;
         
         public FakeTokenStream(Token... tokens) {
             this.tokens = new LinkedList<Token>(Arrays.asList(tokens));
-            this.termAttribute = addAttribute(TermAttribute.class);
+            this.termAttribute = addAttribute(CharTermAttribute.class);
             this.offsetAttribute = addAttribute(OffsetAttribute.class);
             this.posIncAttribute = addAttribute(PositionIncrementAttribute.class);
         }
 
         @Override
-        public boolean incrementToken() throws IOException {
+        public final boolean incrementToken() throws IOException {
             if (tokens.isEmpty())
                 return false;
             clearAttributes();
             Token next = tokens.remove();
-            termAttribute.setTermBuffer(next.term());
+            termAttribute.setEmpty();
+            termAttribute.append(next);
             offsetAttribute.setOffset(next.startOffset(), next.endOffset());
             posIncAttribute.setPositionIncrement(next.getPositionIncrement());
             return true;
@@ -48,7 +49,7 @@ public class CyrillicTransliteratingFilterUnitTest {
     @Test
     public void shouldPassOnTokensWithoutCyrillicUntouched() throws IOException {
         Token asdf = new Token();
-        asdf.setTermBuffer("asdf");
+        asdf.append("asdf");
         asdf.setStartOffset(1);
         asdf.setEndOffset(4);
         TokenFilter filter = new CyrillicTransliteratingFilter(
@@ -61,7 +62,7 @@ public class CyrillicTransliteratingFilterUnitTest {
     @Test
     public void shouldTransliterateCyrillicTokens() throws IOException {
         Token igraCyrillic = new Token();
-        igraCyrillic.setTermBuffer("игра");
+        igraCyrillic.append("игра");
         igraCyrillic.setStartOffset(1);
         igraCyrillic.setEndOffset(4);
         TokenFilter filter = new CyrillicTransliteratingFilter(
@@ -76,7 +77,7 @@ public class CyrillicTransliteratingFilterUnitTest {
     @Test
     public void shouldTransliterateTokensWithMixedLatinAndCyrillic() throws IOException {
         Token mixed = new Token();
-        mixed.setTermBuffer("interнет");
+        mixed.append("interнет");
         mixed.setStartOffset(1);
         mixed.setEndOffset(8);
         TokenFilter filter = new CyrillicTransliteratingFilter(
@@ -90,7 +91,7 @@ public class CyrillicTransliteratingFilterUnitTest {
     
     private void assertAttributes(AttributeSource source, String term,
             int start, int end, int posInc) {
-        assertThat(source.getAttribute(TermAttribute.class).term(),
+        assertThat(source.getAttribute(CharTermAttribute.class).toString(),
                 equalTo(term));
         assertThat(source.getAttribute(OffsetAttribute.class).startOffset(),
                 equalTo(start));