commit 8671f7e1f417bc7aa9c07bf6080007e141395422
parent 1e92de367fae7c2c9ad5025b38163f42662d048d
Author: Dan Callaghan <djc@djc.id.au>
Date: Mon, 18 May 2009 20:56:53 +1000
rudimentary Cyrillic transliteration support
Diffstat:
2 files changed, 172 insertions(+), 0 deletions(-)
diff --git a/src/main/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilter.java b/src/main/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilter.java
@@ -0,0 +1,139 @@
+package au.com.miskinhill.search.analysis;
+
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Token filter that emits a Latin transliteration alongside every token whose
+ * term contains Cyrillic letters.
+ *
+ * Every token from the wrapped stream is passed through unchanged. When a
+ * token's term contains at least one letter from the transliteration table,
+ * a clone of that token carrying the transliterated term and a position
+ * increment of 0 is returned by the following call to {@link #next(Token)}.
+ *
+ * Assumes that tokens have already been lower-cased: the table only contains
+ * lower-case letters, so upper-case Cyrillic is neither detected nor replaced.
+ */
+public class CyrillicTransliteratingFilter extends TokenFilter {
+
+    /** Latin replacement for each supported lower-case Cyrillic letter. */
+    private static final Map<Character, String> TRANSLITERATION_TABLE = new HashMap<Character, String>();
+    static {
+        TRANSLITERATION_TABLE.put('а', "a");
+        TRANSLITERATION_TABLE.put('б', "b");
+        TRANSLITERATION_TABLE.put('в', "v");
+        TRANSLITERATION_TABLE.put('г', "g");
+        TRANSLITERATION_TABLE.put('д', "d");
+        TRANSLITERATION_TABLE.put('е', "e");
+        TRANSLITERATION_TABLE.put('ё', "e");
+        TRANSLITERATION_TABLE.put('ж', "zh");
+        TRANSLITERATION_TABLE.put('з', "z");
+        TRANSLITERATION_TABLE.put('и', "i");
+        TRANSLITERATION_TABLE.put('й', "y");
+        TRANSLITERATION_TABLE.put('к', "k");
+        TRANSLITERATION_TABLE.put('л', "l");
+        TRANSLITERATION_TABLE.put('м', "m");
+        TRANSLITERATION_TABLE.put('н', "n");
+        TRANSLITERATION_TABLE.put('о', "o");
+        TRANSLITERATION_TABLE.put('п', "p");
+        TRANSLITERATION_TABLE.put('р', "r");
+        TRANSLITERATION_TABLE.put('с', "s");
+        TRANSLITERATION_TABLE.put('т', "t");
+        TRANSLITERATION_TABLE.put('у', "u");
+        TRANSLITERATION_TABLE.put('ф', "f");
+        TRANSLITERATION_TABLE.put('х', "kh");
+        TRANSLITERATION_TABLE.put('ц', "ts");
+        TRANSLITERATION_TABLE.put('ч', "ch");
+        TRANSLITERATION_TABLE.put('ш', "sh");
+        TRANSLITERATION_TABLE.put('щ', "shch");
+        TRANSLITERATION_TABLE.put('ъ', "'");
+        TRANSLITERATION_TABLE.put('ы', "y");
+        TRANSLITERATION_TABLE.put('ь', "'");
+        TRANSLITERATION_TABLE.put('э', "e");
+        TRANSLITERATION_TABLE.put('ю', "iu");
+        TRANSLITERATION_TABLE.put('я', "ia");
+    }
+
+    /** Transliterated clone to return from the next call, or null if none is pending. */
+    private Token transliterated = null;
+
+    protected CyrillicTransliteratingFilter(TokenStream input) {
+        super(input);
+    }
+
+    /**
+     * Returns the pending transliterated token if there is one, otherwise the
+     * next token of the wrapped stream.
+     *
+     * @param reusableToken token instance handed on to the wrapped stream
+     * @return the next token, or null once the wrapped stream returns null
+     * @throws IOException if the wrapped stream throws it
+     */
+    @Override
+    public Token next(Token reusableToken) throws IOException {
+        if (transliterated != null) {
+            Token pending = transliterated;
+            transliterated = null;
+            return pending;
+        }
+        Token tok = input.next(reusableToken);
+        if (tok == null) {
+            return null;
+        }
+        String term = tok.term();
+        if (needsTransliterating(term)) {
+            // Work on a clone so that tok itself is returned to the caller untouched.
+            Token copy = (Token) tok.clone();
+            copy.setTermBuffer(transliterate(term));
+            copy.setPositionIncrement(0);
+            transliterated = copy;
+        }
+        return tok;
+    }
+
+    /** Drops any pending transliterated token, then delegates to the superclass. */
+    @Override
+    public void reset() throws IOException {
+        transliterated = null;
+        super.reset();
+    }
+
+    /**
+     * Reports whether the text contains any character that has an entry in the
+     * transliteration table. Detection is driven by the table itself so that every
+     * letter it can replace, including ё (U+0451, outside а-я), is recognised.
+     */
+    private static boolean needsTransliterating(String text) {
+        for (int i = 0; i < text.length(); i++) {
+            if (TRANSLITERATION_TABLE.containsKey(Character.valueOf(text.charAt(i)))) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Replaces every character found in the transliteration table with its
+     * Latin equivalent and copies all other characters through unchanged.
+     */
+    private static String transliterate(CharSequence cyrillic) {
+        StringBuilder latin = new StringBuilder(cyrillic.length());
+        for (int i = 0; i < cyrillic.length(); i++) {
+            char c = cyrillic.charAt(i);
+            String replacement = TRANSLITERATION_TABLE.get(Character.valueOf(c));
+            if (replacement != null) {
+                latin.append(replacement);
+            } else {
+                latin.append(c);
+            }
+        }
+        return latin.toString();
+    }
+
+}
diff --git a/src/test/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilterUnitTest.java b/src/test/java/au/com/miskinhill/search/analysis/CyrillicTransliteratingFilterUnitTest.java
@@ -0,0 +1,77 @@
+package au.com.miskinhill.search.analysis;
+
+import static org.junit.Assert.assertThat;
+import static org.hamcrest.CoreMatchers.equalTo;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.junit.Test;
+
+/** Unit tests for {@link CyrillicTransliteratingFilter}. */
+public class CyrillicTransliteratingFilterUnitTest {
+
+    /** A token without any Cyrillic must come out alone and equal to the input. */
+    @Test
+    public void shouldPassOnTokensWithoutCyrillicUntouched() throws IOException {
+        Token asdf = token("asdf");
+        List<Token> expected = Arrays.asList(asdf);
+        assertThat(filter(Arrays.asList(asdf)), equalTo(expected));
+    }
+
+    /** A Cyrillic token is followed by its Latin form with position increment 0. */
+    @Test
+    public void shouldTransliterateCyrillicTokens() throws IOException {
+        Token igraCyrillic = token("игра");
+        Token igraLatin = stackedToken("igra");
+        List<Token> expected = Arrays.asList(igraCyrillic, igraLatin);
+        assertThat(filter(Arrays.asList(igraCyrillic)), equalTo(expected));
+    }
+
+    /** Latin letters inside a partly Cyrillic token are carried over as they are. */
+    @Test
+    public void shouldTransliterateTokensWithMixedLatinAndCyrillic() throws IOException {
+        Token mixed = token("interнет");
+        Token latin = stackedToken("internet");
+        List<Token> expected = Arrays.asList(mixed, latin);
+        assertThat(filter(Arrays.asList(mixed)), equalTo(expected));
+    }
+
+    /** Creates a token with the given term and the offsets 1 and 4 used by every fixture. */
+    private static Token token(String term) {
+        Token t = new Token();
+        t.setTermBuffer(term);
+        t.setStartOffset(1);
+        t.setEndOffset(4);
+        return t;
+    }
+
+    /** Creates a token like {@link #token(String)} but with a position increment of 0. */
+    private static Token stackedToken(String term) {
+        Token t = token(term);
+        t.setPositionIncrement(0);
+        return t;
+    }
+
+    /** Runs the tokens through the filter and collects its output until it returns null. */
+    private List<Token> filter(List<Token> input) throws IOException {
+        final Iterator<Token> remaining = input.iterator();
+        TokenStream source = new TokenStream() {
+            @Override
+            public Token next(Token reusableToken) throws IOException {
+                return remaining.hasNext() ? remaining.next() : null;
+            }
+        };
+        CyrillicTransliteratingFilter filter = new CyrillicTransliteratingFilter(source);
+        List<Token> collected = new ArrayList<Token>();
+        for (Token tok = filter.next(new Token()); tok != null; tok = filter.next(new Token())) {
+            collected.add(tok);
+        }
+        return collected;
+    }
+}