src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java (4585B) - raw
package au.com.miskinhill.search.analysis;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.io.StringReader;
import java.util.Stack;

import javax.xml.XMLConstants;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamConstants;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.Characters;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;

import org.apache.lucene.analysis.TokenStream;

/**
 * A {@link TokenStream} that walks an XML document event by event and
 * tokenizes its non-whitespace character data. Each run of character data is
 * handed to the analyzer that the {@link PerLanguageAnalyzerMap} returns for
 * the language in effect at that point, where the language is taken from the
 * nearest enclosing element carrying an {@code xml:lang} or XHTML
 * {@code lang} attribute.
 */
public class XMLTokenizer extends TokenStream {

    /** Shared factory; coalescing is enabled in the static initializer. */
    private static final XMLInputFactory factory = XMLInputFactory.newInstance();
    static {
        factory.setProperty("javax.xml.stream.isCoalescing", true);
    }

    /**
     * @return the shared, coalescing {@link XMLInputFactory} used by this class
     */
    public static XMLInputFactory getXMLInputFactory() {
        return factory;
    }

    private static final String XHTML_NS_URI = "http://www.w3.org/1999/xhtml";

    /**
     * Stack of languages, one entry per open element. Pushing {@code null}
     * (an element that declares no language) re-pushes the language that is
     * currently in effect, so the top of the stack is always the inherited
     * language. Entries may be {@code null} when no ancestor declares one.
     */
    private static class LangStack extends Stack<String> {
        private static final long serialVersionUID = 7020093255092191463L;
        private String current = null;

        public LangStack() {
        }

        /**
         * Records the language for a newly opened element.
         *
         * @param item the element's declared language, or {@code null} to
         *             inherit the language currently in effect
         * @return {@code item}, unchanged
         */
        @Override
        public String push(String item) {
            if (item != null) {
                current = item;
            }
            super.push(current);
            return item;
        }

        /**
         * Discards the innermost element's language and restores the
         * enclosing one ({@code null} once the stack is empty).
         *
         * @return the language that was in effect for the popped element
         */
        @Override
        public synchronized String pop() {
            String top = super.pop();
            current = empty() ? null : peek();
            return top;
        }

        /**
         * @return the language currently in effect, or {@code null} if none
         */
        public String getCurrent() {
            return current;
        }
    }

    private final XMLEventReader r;
    private final PerLanguageAnalyzerMap analyzerMap;
    private final LangStack langs = new LangStack();

    /** Current delegate in use (null if none currently) */
    private TokenStream delegate = null;

    /**
     * Creates a tokenizer over XML read from a character stream.
     *
     * @param reader      source of the XML document
     * @param analyzerMap supplies the analyzer to use for each language
     * @throws XMLStreamException if the event reader cannot be created
     */
    public XMLTokenizer(Reader reader, PerLanguageAnalyzerMap analyzerMap) throws XMLStreamException {
        this.analyzerMap = analyzerMap;
        r = factory.createXMLEventReader(reader);
    }

    /**
     * Creates a tokenizer over XML read from a byte stream.
     *
     * @param in          source of the XML document
     * @param analyzerMap supplies the analyzer to use for each language
     * @throws XMLStreamException if the event reader cannot be created
     */
    public XMLTokenizer(InputStream in, PerLanguageAnalyzerMap analyzerMap) throws XMLStreamException {
        this.analyzerMap = analyzerMap;
        r = factory.createXMLEventReader(in);
    }

    /**
     * Advances to the next token. Tokens are drawn from the delegate stream
     * built for the most recent character-data event; when that delegate is
     * exhausted, further XML events are consumed until another run of
     * non-whitespace text yields a token or the document ends.
     *
     * @return {@code true} if a token was produced, {@code false} at end of input
     * @throws IOException if the delegate fails, or wrapping any
     *                     {@link XMLStreamException} raised while reading events
     */
    @Override
    public final boolean incrementToken() throws IOException {
        clearAttributes();

        // first try our current string delegate, if we have one
        if (delegate != null) {
            if (delegate.incrementToken()) {
                return true;
            }
            delegate = null;
        }

        while (r.hasNext()) {
            XMLEvent event;
            try {
                event = r.nextEvent();
            } catch (XMLStreamException e) {
                throw new IOException(e);
            }
            switch (event.getEventType()) {
                case XMLStreamConstants.START_ELEMENT:
                    langs.push(getLang(event.asStartElement()));
                    break;
                case XMLStreamConstants.CHARACTERS:
                    Characters chars = event.asCharacters();
                    if (chars.isWhiteSpace()) {
                        break; // don't care
                    }
                    Analyzer analyzer = analyzerMap.getAnalyzer(langs.getCurrent());
                    // The delegate is built with this stream passed to tokenizer(), and the
                    // event's character offset is handed to OffsetTokenFilter
                    // (NOTE(review): presumably so token offsets are document-relative -- confirm).
                    delegate = new OffsetTokenFilter(
                            analyzer.applyFilters(analyzer.tokenizer(this, new StringReader(chars.getData()))),
                            event.getLocation().getCharacterOffset());
                    if (delegate.incrementToken()) {
                        return true;
                    }
                    delegate = null;
                    break;
                case XMLStreamConstants.END_ELEMENT:
                    langs.pop();
                    break;
                default:
                    // all other event types are ignored
                    break;
            }
        }
        return false;
    }

    /**
     * Determines the language declared directly on an element.
     *
     * @param se the start-element event to inspect
     * @return the value of {@code xml:lang} if present, otherwise the value of
     *         the XHTML {@code lang} attribute, otherwise {@code null}
     */
    private String getLang(StartElement se) {
        // xml:lang takes precedence
        Attribute xmlLang = findLangAttribute(se, XMLConstants.XML_NS_URI);
        if (xmlLang != null) {
            return xmlLang.getValue();
        }

        Attribute xhtmlLang = findLangAttribute(se, XHTML_NS_URI);
        if (xhtmlLang != null) {
            return xhtmlLang.getValue();
        }

        return null;
    }

    /**
     * Looks up the {@code lang} attribute belonging to the given namespace.
     * When that namespace is the element's default namespace, the attribute
     * is looked up with no namespace instead.
     */
    private static Attribute findLangAttribute(StartElement se, String namespaceUri) {
        // Compare by value, not identity: the parser is not required to hand back
        // interned strings, so == could fail for an equal URI. Putting the constant
        // first also tolerates a null default namespace.
        String attributeNamespace = namespaceUri.equals(se.getNamespaceURI(""))
                ? XMLConstants.NULL_NS_URI
                : namespaceUri;
        return se.getAttributeByName(new QName(attributeNamespace, "lang"));
    }

}