lucene-multilingual

Multilingual enhancements for the Lucene text search library
git clone https://code.djc.id.au/git/lucene-multilingual/
commit 8671f7e1f417bc7aa9c07bf6080007e141395422
parent 1e92de367fae7c2c9ad5025b38163f42662d048d
Author: Dan Callaghan <djc@djc.id.au>
Date:   Mon, 18 May 2009 20:56:53 +1000

rudimentary Cyrillic transliteration support

Diffstat:
A src/main/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilter.java | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A src/test/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilterUnitTest.java | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 172 insertions(+), 0 deletions(-)
diff --git a/src/main/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilter.java b/src/main/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilter.java
@@ -0,0 +1,95 @@
+package au.com.miskinhill.search.analysis;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Assumes that tokens have already been lower-cased.
+ */
+public class CyrillicTransliteratingFilter extends TokenFilter {
+    
+    private static final String CYRILLIC_PATTERN = ".*[а-я]+.*";
+    
+    private Token transliterated = null;
+    
+    protected CyrillicTransliteratingFilter(TokenStream input) {
+        super(input);
+    }
+    
+    @Override
+    public Token next(Token reusableToken) throws IOException {
+        Token tok;
+        if (transliterated == null) {
+            tok = input.next(reusableToken);
+            if (tok == null) return null;
+            if (needsTransliterating(tok.term())) {
+                transliterated = (Token) tok.clone();
+                transliterated.setTermBuffer(transliterate(transliterated.term()));
+                transliterated.setPositionIncrement(0);
+            }
+        } else {
+            tok = transliterated;
+            transliterated = null;
+        }
+        return tok;
+    }
+    
+    private static boolean needsTransliterating(String text) {
+        return (text.matches(CYRILLIC_PATTERN));
+    }
+    
+    private static final Map<Character, String> TRANSLITERATION_TABLE = new HashMap<Character, String>();
+    static {
+        TRANSLITERATION_TABLE.put('а', "a");
+        TRANSLITERATION_TABLE.put('б', "b");
+        TRANSLITERATION_TABLE.put('в', "v");
+        TRANSLITERATION_TABLE.put('г', "g");
+        TRANSLITERATION_TABLE.put('д', "d");
+        TRANSLITERATION_TABLE.put('е', "e");
+        TRANSLITERATION_TABLE.put('ё', "e");
+        TRANSLITERATION_TABLE.put('ж', "zh");
+        TRANSLITERATION_TABLE.put('з', "z");
+        TRANSLITERATION_TABLE.put('и', "i");
+        TRANSLITERATION_TABLE.put('й', "y");
+        TRANSLITERATION_TABLE.put('к', "k");
+        TRANSLITERATION_TABLE.put('л', "l");
+        TRANSLITERATION_TABLE.put('м', "m");
+        TRANSLITERATION_TABLE.put('н', "n");
+        TRANSLITERATION_TABLE.put('о', "o");
+        TRANSLITERATION_TABLE.put('п', "p");
+        TRANSLITERATION_TABLE.put('р', "r");
+        TRANSLITERATION_TABLE.put('с', "s");
+        TRANSLITERATION_TABLE.put('т', "t");
+        TRANSLITERATION_TABLE.put('у', "u");
+        TRANSLITERATION_TABLE.put('ф', "f");
+        TRANSLITERATION_TABLE.put('х', "kh");
+        TRANSLITERATION_TABLE.put('ц', "ts");
+        TRANSLITERATION_TABLE.put('ч', "ch");
+        TRANSLITERATION_TABLE.put('ш', "sh");
+        TRANSLITERATION_TABLE.put('щ', "shch");
+        TRANSLITERATION_TABLE.put('ъ', "'");
+        TRANSLITERATION_TABLE.put('ы', "y");
+        TRANSLITERATION_TABLE.put('ь', "'");
+        TRANSLITERATION_TABLE.put('э', "e");
+        TRANSLITERATION_TABLE.put('ю', "iu");
+        TRANSLITERATION_TABLE.put('я', "ia");
+    }
+    
+    private static String transliterate(CharSequence cyrillic) {
+        StringBuilder transliterated = new StringBuilder();
+        for (int i = 0; i < cyrillic.length(); i ++) {
+            Character c = cyrillic.charAt(i);
+            if (TRANSLITERATION_TABLE.containsKey(c))
+                transliterated.append(TRANSLITERATION_TABLE.get(c));
+            else 
+                transliterated.append(c);
+        }
+        return transliterated.toString();
+    }
+
+}
diff --git a/src/test/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilterUnitTest.java b/src/test/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilterUnitTest.java
@@ -0,0 +1,77 @@
+package au.com.miskinhill.search.analysis;
+
+import static org.junit.Assert.assertThat;
+import static org.hamcrest.CoreMatchers.equalTo;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.junit.Test;
+
+public class CyrillicTransliteratingFilterUnitTest {
+
+    /** Tokens containing no Cyrillic must come out of the filter unchanged. */
+    @Test
+    public void shouldPassOnTokensWithoutCyrillicUntouched() throws IOException {
+        Token asdf = token("asdf");
+        List<Token> expected = Arrays.asList(asdf);
+        assertThat(filter(Arrays.asList(asdf)), equalTo(expected));
+    }
+    
+    /** A Cyrillic token must be followed by its transliteration at the same position. */
+    @Test
+    public void shouldTransliterateCyrillicTokens() throws IOException {
+        Token igraCyrillic = token("игра");
+        Token igraLatin = token("igra");
+        igraLatin.setPositionIncrement(0);
+        List<Token> expected = Arrays.asList(igraCyrillic, igraLatin);
+        assertThat(filter(Arrays.asList(igraCyrillic)), equalTo(expected));
+    }
+    
+    /** Latin characters inside a partly Cyrillic token must be kept as they are. */
+    @Test
+    public void shouldTransliterateTokensWithMixedLatinAndCyrillic() throws IOException {
+        Token mixed = token("interнет");
+        Token latin = token("internet");
+        latin.setPositionIncrement(0);
+        List<Token> expected = Arrays.asList(mixed, latin);
+        assertThat(filter(Arrays.asList(mixed)), equalTo(expected));
+    }
+    
+    /**
+     * Builds a token for the given term with the start offset 1 and end
+     * offset 4 that every test in this class uses.
+     */
+    private static Token token(String term) {
+        Token tok = new Token();
+        tok.setTermBuffer(term);
+        tok.setStartOffset(1);
+        tok.setEndOffset(4);
+        return tok;
+    }
+    
+    /**
+     * Feeds the given tokens through a CyrillicTransliteratingFilter and
+     * collects everything it emits, in order.
+     */
+    private List<Token> filter(List<Token> input) throws IOException {
+        final Iterator<Token> source = input.iterator();
+        TokenStream upstream = new TokenStream() {
+            @Override
+            public Token next(Token reusableToken) throws IOException {
+                return source.hasNext() ? source.next() : null;
+            }
+        };
+        CyrillicTransliteratingFilter underTest = new CyrillicTransliteratingFilter(upstream);
+        List<Token> collected = new ArrayList<Token>();
+        for (Token t = underTest.next(new Token()); t != null; t = underTest.next(new Token()))
+            collected.add(t);
+        return collected;
+    }
+
+}