commit 1e92de367fae7c2c9ad5025b38163f42662d048d
parent 8112f747c63524b334a5069d8c5b4c07d9687dd0
Author: Dan Callaghan <djc@djc.id.au>
Date: Thu, 1 Jan 2009 00:29:28 +1000
Warn to stderr when we fall back to the default analyzer for an unknown language (for sanity checking); also skip whitespace-only character events in XMLTokenizer
--HG--
extra : convert_revision : ad32c5fb6e00a31126c2162af5f22a16e296d9e8
Diffstat:
2 files changed, 6 insertions(+), 0 deletions(-)
diff --git a/src/main/java/au/com/miskinhill/search/analysis/PerLanguageAnalyzerWrapper.java b/src/main/java/au/com/miskinhill/search/analysis/PerLanguageAnalyzerWrapper.java
@@ -45,12 +45,16 @@ public class PerLanguageAnalyzerWrapper extends Analyzer {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
+ System.err.println("WARNING: " + this + " using default analyzer");
return tokenStream("", fieldName, reader);
}
public TokenStream tokenStream(String language, String fieldName, Reader reader) {
if (language == null) language = "";
Analyzer a = analyzers.get(language);
+ if (a == analyzersList.get(0))
+ System.err.println("WARNING: " + this +
+ " using default analyzer for language " + language);
return a.tokenStream(fieldName, reader);
}
diff --git a/src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java b/src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java
@@ -94,6 +94,8 @@ public class XMLTokenizer extends TokenStream {
break;
case XMLStreamConstants.CHARACTERS:
Characters chars = event.asCharacters();
+ if (chars.isWhiteSpace())
+ break; // don't care
delegate = new OffsetTokenFilter(
analyzer.tokenStream(langs.getCurrent(),
null, new StringReader(chars.getData())),