commit 0a2064846a3ae9d75f46882a0b37801c3bebba88 parent ebc7e7059f3dd5e720925d904b70f1009b02bb33 Author: Dan Callaghan <djc@djc.id.au> Date: Wed, 31 Dec 2008 13:48:42 +1000 rearranged stuff --HG-- extra : convert_revision : 1980d8e814176078fee09c70124009cf37a204a0 Diffstat:
| A | src/main/java/au/com/miskinhill/search/analysis/OffsetTokenFilter.java | | | 28 | ++++++++++++++++++++++++++++ |
| A | src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java | | | 130 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
2 files changed, 158 insertions(+), 0 deletions(-)
diff --git a/src/main/java/au/com/miskinhill/search/analysis/OffsetTokenFilter.java b/src/main/java/au/com/miskinhill/search/analysis/OffsetTokenFilter.java
@@ -0,0 +1,28 @@
+package au.com.miskinhill.search.analysis;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Token filter that shifts the start and end offsets of every token produced
+ * by the wrapped stream by a fixed amount.
+ */
+public class OffsetTokenFilter extends TokenFilter {
+
+    /** Amount added to both the start and the end offset of each token. */
+    private final int offset;
+
+    /**
+     * @param input stream whose tokens are passed through
+     * @param offset amount to add to each token's start and end offset
+     */
+    protected OffsetTokenFilter(TokenStream input, int offset) {
+        super(input);
+        this.offset = offset;
+    }
+
+    /**
+     * Pulls the next token from the wrapped stream and shifts its offsets.
+     *
+     * @param reusableToken token instance handed on to the wrapped stream
+     * @return the token returned by the wrapped stream with its offsets
+     *         shifted, or null when the wrapped stream returns null
+     * @throws IOException if the wrapped stream fails
+     */
+    @Override
+    public Token next(Token reusableToken) throws IOException {
+        final Token shifted = input.next(reusableToken);
+        // Nothing to adjust at end of stream or when the shift is zero.
+        if (shifted == null || offset == 0) {
+            return shifted;
+        }
+        shifted.setStartOffset(shifted.startOffset() + offset);
+        shifted.setEndOffset(shifted.endOffset() + offset);
+        return shifted;
+    }
+
+}
diff --git a/src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java b/src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java
@@ -0,0 +1,130 @@
+package au.com.miskinhill.search.analysis;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.Stack;
+
+import javax.xml.XMLConstants;
+import javax.xml.namespace.QName;
+import javax.xml.stream.XMLEventReader;
+import javax.xml.stream.XMLInputFactory;
+import javax.xml.stream.XMLStreamConstants;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.events.Attribute;
+import javax.xml.stream.events.Characters;
+import javax.xml.stream.events.StartElement;
+import javax.xml.stream.events.XMLEvent;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+public class XMLTokenizer extends TokenStream {
+
+ private static final XMLInputFactory factory = XMLInputFactory.newInstance();
+ static {
+ factory.setProperty("javax.xml.stream.isCoalescing", true);
+ }
+
+ private static final String XHTML_NS_URI = "http://www.w3.org/1999/xhtml";
+
+ private static class LangStack extends Stack<String> {
+ private static final long serialVersionUID = 7020093255092191463L;
+ private String current = null;
+ public String push(String item) {
+ if (item != null)
+ current = item;
+ super.push(current);
+ return item;
+ }
+ public synchronized String pop() {
+ String top = super.pop();
+ current = empty() ? null : peek();
+ return top;
+ }
+ public String getCurrent() {
+ return current;
+ }
+ }
+
+ private XMLEventReader r;
+ private PerLanguageAnalyzerWrapper analyzer;
+ private LangStack langs = new LangStack();
+
+ /** Current delegate in use (null if none currently) */
+ private TokenStream delegate = null;
+
+ public XMLTokenizer(Reader reader, PerLanguageAnalyzerWrapper analyzer) throws XMLStreamException {
+ this.analyzer = analyzer;
+ r = factory.createXMLEventReader(reader);
+ }
+
+ public XMLTokenizer(InputStream in) throws XMLStreamException {
+ XMLInputFactory factory = XMLInputFactory.newInstance();
+ factory.setProperty("javax.xml.stream.isCoalescing", true);
+ r = factory.createXMLEventReader(in);
+ }
+
+ public Token next(Token reusableToken) throws IOException {
+ // first try our current string delegate, if we have one
+ if (delegate != null) {
+ Token retval = delegate.next(reusableToken);
+ if (retval != null)
+ return retval;
+ else
+ delegate = null;
+ }
+
+ while (r.hasNext()) {
+ XMLEvent event;
+ try {
+ event = r.nextEvent();
+ } catch (XMLStreamException e) {
+ throw new IOException(e);
+ }
+ switch (event.getEventType()) {
+ case XMLStreamConstants.START_ELEMENT:
+ StartElement se = event.asStartElement();
+ langs.push(getLang(se));
+ break;
+ case XMLStreamConstants.CHARACTERS:
+ Characters chars = event.asCharacters();
+ delegate = new OffsetTokenFilter(
+ analyzer.tokenStream(langs.getCurrent(),
+ null, new StringReader(chars.getData())),
+ event.getLocation().getCharacterOffset());
+ Token retval = delegate.next(reusableToken);
+ if (retval != null)
+ return retval;
+ else
+ delegate = null;
+ break;
+ case XMLStreamConstants.END_ELEMENT:
+ langs.pop();
+ break;
+ }
+ }
+ return null;
+ }
+
+ private String getLang(StartElement se) {
+ // xml:lang takes precedence
+ QName xmlLangQName = new QName(
+ se.getNamespaceURI("") == XMLConstants.XML_NS_URI ? "" : XMLConstants.XML_NS_URI,
+ "lang");
+ Attribute xmlLang = se.getAttributeByName(xmlLangQName);
+ if (xmlLang != null)
+ return xmlLang.getValue();
+
+ QName xhtmlLangQName = new QName(
+ se.getNamespaceURI("") == XHTML_NS_URI ? "" : XHTML_NS_URI,
+ "lang");
+ Attribute xhtmlLang = se.getAttributeByName(xhtmlLangQName);
+ if (xhtmlLang != null)
+ return xhtmlLang.getValue();
+
+ return null;
+ }
+
+}