lucene-multilingual

Multilingual enhancements for the Lucene text search library
git clone https://code.djc.id.au/git/lucene-multilingual/
commit 1e92de367fae7c2c9ad5025b38163f42662d048d
parent 8112f747c63524b334a5069d8c5b4c07d9687dd0
Author: Dan Callaghan <djc@djc.id.au>
Date:   Thu,  1 Jan 2009 00:29:28 +1000

warn to stderr when we hit an unknown language (for sanity checking)

--HG--
extra : convert_revision : ad32c5fb6e00a31126c2162af5f22a16e296d9e8

Diffstat:
M src/main/java/au/com/miskinhill/search/analysis/PerLanguageAnalyzerWrapper.java | 4 ++++
M src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java | 2 ++
2 files changed, 6 insertions(+), 0 deletions(-)
diff --git a/src/main/java/au/com/miskinhill/search/analysis/PerLanguageAnalyzerWrapper.java b/src/main/java/au/com/miskinhill/search/analysis/PerLanguageAnalyzerWrapper.java
@@ -45,12 +45,16 @@ public class PerLanguageAnalyzerWrapper extends Analyzer {
 
 	@Override
 	public TokenStream tokenStream(String fieldName, Reader reader) {
+		System.err.println("WARNING: " + this + " using default analyzer");
 		return tokenStream("", fieldName, reader);
 	}
 	
 	public TokenStream tokenStream(String language, String fieldName, Reader reader) {
 		if (language == null) language = "";
 		Analyzer a = analyzers.get(language);
+		if (a == analyzersList.get(0))
+			System.err.println("WARNING: " + this + 
+					" using default analyzer for language " + language);
 		return a.tokenStream(fieldName, reader);
 	}
 
diff --git a/src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java b/src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java
@@ -94,6 +94,8 @@ public class XMLTokenizer extends TokenStream {
 					break;
 				case XMLStreamConstants.CHARACTERS:
 					Characters chars = event.asCharacters();
+					if (chars.isWhiteSpace())
+						break; // don't care
 					delegate = new OffsetTokenFilter(
 							analyzer.tokenStream(langs.getCurrent(), 
 									null, new StringReader(chars.getData())),