commit 0a2064846a3ae9d75f46882a0b37801c3bebba88 parent ebc7e7059f3dd5e720925d904b70f1009b02bb33 Author: Dan Callaghan <djc@djc.id.au> Date: Wed, 31 Dec 2008 13:48:42 +1000 rearranged stuff --HG-- extra : convert_revision : 1980d8e814176078fee09c70124009cf37a204a0 Diffstat:
A | src/main/java/au/com/miskinhill/search/analysis/OffsetTokenFilter.java | | | 28 | ++++++++++++++++++++++++++++ |
A | src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java | | | 130 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
2 files changed, 158 insertions(+), 0 deletions(-) diff --git a/src/main/java/au/com/miskinhill/search/analysis/OffsetTokenFilter.java b/src/main/java/au/com/miskinhill/search/analysis/OffsetTokenFilter.java @@ -0,0 +1,28 @@ +package au.com.miskinhill.search.analysis; + +import java.io.IOException; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenFilter; +import org.apache.lucene.analysis.TokenStream; + +public class OffsetTokenFilter extends TokenFilter { + + private int offset; + + protected OffsetTokenFilter(TokenStream input, int offset) { + super(input); + this.offset = offset; + } + + @Override + public Token next(Token reusableToken) throws IOException { + Token retval = input.next(reusableToken); + if (retval != null && offset != 0) { + retval.setStartOffset(retval.startOffset() + offset); + retval.setEndOffset(retval.endOffset() + offset); + } + return retval; + } + +} diff --git a/src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java b/src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java @@ -0,0 +1,130 @@ +package au.com.miskinhill.search.analysis; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.io.StringReader; +import java.util.Stack; + +import javax.xml.XMLConstants; +import javax.xml.namespace.QName; +import javax.xml.stream.XMLEventReader; +import javax.xml.stream.XMLInputFactory; +import javax.xml.stream.XMLStreamConstants; +import javax.xml.stream.XMLStreamException; +import javax.xml.stream.events.Attribute; +import javax.xml.stream.events.Characters; +import javax.xml.stream.events.StartElement; +import javax.xml.stream.events.XMLEvent; + +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; + +public class XMLTokenizer extends TokenStream { + + private static final XMLInputFactory factory = XMLInputFactory.newInstance(); + static { + factory.setProperty("javax.xml.stream.isCoalescing", true); + } + + private static final String XHTML_NS_URI = "http://www.w3.org/1999/xhtml"; + + private static class LangStack extends Stack<String> { + private static final long serialVersionUID = 7020093255092191463L; + private String current = null; + public String push(String item) { + if (item != null) + current = item; + super.push(current); + return item; + } + public synchronized String pop() { + String top = super.pop(); + current = empty() ? null : peek(); + return top; + } + public String getCurrent() { + return current; + } + } + + private XMLEventReader r; + private PerLanguageAnalyzerWrapper analyzer; + private LangStack langs = new LangStack(); + + /** Current delegate in use (null if none currently) */ + private TokenStream delegate = null; + + public XMLTokenizer(Reader reader, PerLanguageAnalyzerWrapper analyzer) throws XMLStreamException { + this.analyzer = analyzer; + r = factory.createXMLEventReader(reader); + } + + public XMLTokenizer(InputStream in) throws XMLStreamException { + XMLInputFactory factory = XMLInputFactory.newInstance(); + factory.setProperty("javax.xml.stream.isCoalescing", true); + r = factory.createXMLEventReader(in); + } + + public Token next(Token reusableToken) throws IOException { + // first try our current string delegate, if we have one + if (delegate != null) { + Token retval = delegate.next(reusableToken); + if (retval != null) + return retval; + else + delegate = null; + } + + while (r.hasNext()) { + XMLEvent event; + try { + event = r.nextEvent(); + } catch (XMLStreamException e) { + throw new IOException(e); + } + switch (event.getEventType()) { + case XMLStreamConstants.START_ELEMENT: + StartElement se = event.asStartElement(); + langs.push(getLang(se)); + break; + case XMLStreamConstants.CHARACTERS: + Characters chars = event.asCharacters(); + delegate = new OffsetTokenFilter( + analyzer.tokenStream(langs.getCurrent(), + null, new StringReader(chars.getData())), + event.getLocation().getCharacterOffset()); + Token retval = delegate.next(reusableToken); + if (retval != null) + return retval; + else + delegate = null; + break; + case XMLStreamConstants.END_ELEMENT: + langs.pop(); + break; + } + } + return null; + } + + private String getLang(StartElement se) { + // xml:lang takes precedence + QName xmlLangQName = new QName( + se.getNamespaceURI("") == XMLConstants.XML_NS_URI ? "" : XMLConstants.XML_NS_URI, + "lang"); + Attribute xmlLang = se.getAttributeByName(xmlLangQName); + if (xmlLang != null) + return xmlLang.getValue(); + + QName xhtmlLangQName = new QName( + se.getNamespaceURI("") == XHTML_NS_URI ? "" : XHTML_NS_URI, + "lang"); + Attribute xhtmlLang = se.getAttributeByName(xhtmlLangQName); + if (xhtmlLang != null) + return xhtmlLang.getValue(); + + return null; + } + +}