commit ebc7e7059f3dd5e720925d904b70f1009b02bb33
Author: Dan Callaghan <djc@djc.id.au>
Date: Wed, 31 Dec 2008 13:00:17 +1000
moving lucene stuff into its own module
--HG--
extra : convert_revision : 95c31a0efdf511984357b2cc7b06a35615a9ab6c
Diffstat:
4 files changed, 201 insertions(+), 0 deletions(-)
diff --git a/pom.xml b/pom.xml
@@ -0,0 +1,28 @@
+<!-- Maven build descriptor for the "multilingual" module (Lucene classes for multi-language text). -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <!-- Artifact coordinates: the module is packaged as a jar and is still a development snapshot. -->
+ <groupId>au.com.miskinhill.search</groupId>
+ <artifactId>multilingual</artifactId>
+ <packaging>jar</packaging>
+ <version>1.0-SNAPSHOT</version>
+ <name>multilingual</name>
+ <description>Lucene classes for working with text in multiple languages</description>
+ <url>http://code.miskinhill.com.au/search/multilingual/</url>
+ <organization>
+ <name>Miskin Hill</name>
+ <url>http://miskinhill.com.au/</url>
+ </organization>
+ <dependencies>
+ <!-- JUnit is declared with test scope, so it is only available to the unit tests. -->
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <version>4.5</version>
+ <scope>test</scope>
+ </dependency>
+ <!-- Lucene core: the analysis package imports org.apache.lucene.analysis.Analyzer and TokenStream. -->
+ <dependency>
+ <groupId>org.apache.lucene</groupId>
+ <artifactId>lucene-core</artifactId>
+ <version>2.4.0</version>
+ </dependency>
+ </dependencies>
+</project>
diff --git a/src/main/java/au/com/miskinhill/search/analysis/PerLanguageAnalyzerWrapper.java b/src/main/java/au/com/miskinhill/search/analysis/PerLanguageAnalyzerWrapper.java
@@ -0,0 +1,43 @@
+package au.com.miskinhill.search.analysis;
+
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * In the same vein as
+ * {@link org.apache.lucene.analysis.PerFieldAnalyzerWrapper}, this analyzer
+ * delegates to a sub-analyzer based on the language of the text
+ * being analysed. The default sub-analyzer is given in the constructor; this is
+ * used when the language is not specified, or when a language is specified for
+ * which we have no specific sub-analyzer. Use
+ * {@link #addAnalyzer(String, Analyzer)} to add a sub-analyzer for a specific
+ * language.
+ * <p>
+ * Note that languages are matched by prefix, so that if a sub-analyzer has been
+ * added for "en" (but not "en-AU"), it will be selected when analysing text
+ * whose language is given as "en-AU".
+ */
+public class PerLanguageAnalyzerWrapper extends Analyzer {
+
+    /**
+     * Sub-analyzers keyed by language. The value stored under the empty key
+     * is the default analyzer supplied to the constructor.
+     */
+    protected Trie<Analyzer> analyzers;
+
+    /**
+     * Creates a wrapper that uses the given analyzer whenever no more
+     * specific sub-analyzer matches the requested language.
+     *
+     * @param defaultAnalyzer analyzer registered under the empty language
+     */
+    public PerLanguageAnalyzerWrapper(Analyzer defaultAnalyzer) {
+        analyzers = new Trie<Analyzer>(defaultAnalyzer);
+    }
+
+    /**
+     * Registers a sub-analyzer for the given language, replacing any
+     * sub-analyzer previously registered for exactly that language.
+     *
+     * @param language language key, e.g. "en" or "en-AU"
+     * @param analyzer analyzer to use for text in that language
+     * @throws IllegalArgumentException if {@code analyzer} is null
+     */
+    public void addAnalyzer(String language, Analyzer analyzer) {
+        analyzers.put(language, analyzer);
+    }
+
+    /**
+     * Analyses text whose language is not specified, i.e. looks up the
+     * sub-analyzer with the empty language key.
+     *
+     * @param fieldName name of the field being analysed
+     * @param reader source of the text to analyse
+     * @return the token stream produced by the selected sub-analyzer
+     */
+    @Override
+    public TokenStream tokenStream(String fieldName, Reader reader) {
+        return tokenStream("", fieldName, reader);
+    }
+
+    /**
+     * Analyses text in the given language, delegating to the sub-analyzer
+     * registered for the longest matching prefix of {@code language}.
+     *
+     * @param language language of the text; {@code null} is treated the same
+     *            as the empty string (language not specified)
+     * @param fieldName name of the field being analysed
+     * @param reader source of the text to analyse
+     * @return the token stream produced by the selected sub-analyzer
+     */
+    public TokenStream tokenStream(String language, String fieldName, Reader reader) {
+        // A null language used to fail inside the trie lookup; map it to the
+        // empty key so that "not specified" selects the default analyzer.
+        String lookupKey = (language == null) ? "" : language;
+        Analyzer selected = analyzers.get(lookupKey);
+        return selected.tokenStream(fieldName, reader);
+    }
+
+}
diff --git a/src/main/java/au/com/miskinhill/search/analysis/Trie.java b/src/main/java/au/com/miskinhill/search/analysis/Trie.java
@@ -0,0 +1,70 @@
+package au.com.miskinhill.search.analysis;
+
+import java.util.HashMap;
+import java.util.Map;
+
+// TODO move this into its own/a common utilities module?
+public class Trie<T> {
+
+    private TrieNode root;
+
+    /**
+     * Builds a trie whose empty-string entry holds {@code rootValue}. That
+     * value is what {@link #get(CharSequence)} falls back to when no stored
+     * key with a value is a prefix of the requested key.
+     */
+    public Trie(T rootValue) {
+        this.root = new TrieNode(rootValue);
+    }
+
+    /**
+     * Stores {@code value} under {@code key}, creating any missing
+     * intermediate nodes and overwriting a value already stored for exactly
+     * that key.
+     *
+     * @throws IllegalArgumentException if {@code value} is null
+     */
+    public void put(CharSequence key, T value) {
+        if (value == null) {
+            throw new IllegalArgumentException("null values cannot be stored");
+        }
+        TrieNode node = root;
+        for (int pos = 0; pos < key.length(); pos++) {
+            Character edge = Character.valueOf(key.charAt(pos));
+            TrieNode next = node.children.get(edge);
+            if (next == null) {
+                // No path for this character yet: extend the trie with a valueless node.
+                next = new TrieNode(null);
+                node.children.put(edge, next);
+            }
+            node = next;
+        }
+        node.value = value;
+    }
+
+    /**
+     * Looks up {@code key} by longest prefix: walks the key one character at
+     * a time and returns the value of the deepest node on that path that has
+     * a value, starting from the root value.
+     */
+    public T get(CharSequence key) {
+        T best = root.value;
+        TrieNode node = root;
+        for (int pos = 0; pos < key.length(); pos++) {
+            node = node.children.get(Character.valueOf(key.charAt(pos)));
+            if (node == null) {
+                // The stored keys diverge from the requested key here.
+                break;
+            }
+            if (node.value != null) {
+                best = node.value;
+            }
+        }
+        return best;
+    }
+
+    /** One node of the trie: an optional value plus child nodes keyed by the next character. */
+    private class TrieNode {
+        public T value;
+        public Map<Character, TrieNode> children = new HashMap<Character, TrieNode>();
+
+        public TrieNode(T value) {
+            this.value = value;
+        }
+    }
+
+}
diff --git a/src/test/java/au/com/miskinhill/search/analysis/TrieUnitTest.java b/src/test/java/au/com/miskinhill/search/analysis/TrieUnitTest.java
@@ -0,0 +1,60 @@
+package au.com.miskinhill.search.analysis;
+
+import static org.junit.Assert.*;
+
+import org.junit.Test;
+
+import au.com.miskinhill.search.analysis.Trie;
+
+public class TrieUnitTest {
+
+    /**
+     * Builds a trie with the given root value and then stores the supplied
+     * entries in order; {@code entries} alternates key, value, key, value...
+     */
+    private static Trie<String> trieOf(String rootValue, String... entries) {
+        Trie<String> trie = new Trie<String>(rootValue);
+        for (int i = 0; i < entries.length; i += 2) {
+            trie.put(entries[i], entries[i + 1]);
+        }
+        return trie;
+    }
+
+    /** A trie holding only its root value returns that value for every key. */
+    @Test
+    public void testEmptyTrie() {
+        Trie<String> trie = trieOf("asdf");
+        assertEquals("asdf", trie.get(""));
+        assertEquals("asdf", trie.get("somekey"));
+    }
+
+    /** A key sharing no leading character with any stored key yields the root value. */
+    @Test
+    public void testNoPrefix() {
+        Trie<String> trie = trieOf("root", "en", "English", "de", "Deutsch");
+        assertEquals("root", trie.get("pl"));
+    }
+
+    /** A key sharing only a valueless partial path with a stored key yields the root value. */
+    @Test
+    public void testPrefixButNoMatch() {
+        Trie<String> trie = trieOf("root", "en", "English", "de", "Deutsch");
+        assertEquals("root", trie.get("es"));
+    }
+
+    /** A stored key that is a prefix of the requested key supplies the value. */
+    @Test
+    public void testPrefixMatch() {
+        Trie<String> trie = trieOf("root", "en", "English", "de", "Deutsch");
+        assertEquals("English", trie.get("en-AU"));
+    }
+
+    /** An exact match wins over a shorter stored prefix. */
+    @Test
+    public void testExactMatch() {
+        Trie<String> trie = trieOf("root", "en", "English", "en-AU", "Australian", "de", "Deutsch");
+        assertEquals("Australian", trie.get("en-AU"));
+    }
+
+    /** A sibling of a longer stored key falls back to the shared shorter prefix. */
+    @Test
+    public void testDifferentPrefixMatch() {
+        Trie<String> trie = trieOf("root", "en", "English", "en-AU", "Australian", "de", "Deutsch");
+        assertEquals("English", trie.get("en-GB"));
+    }
+
+}