src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java (4585B) - raw
1 package au.com.miskinhill.search.analysis;
2
3 import java.io.IOException;
4 import java.io.InputStream;
5 import java.io.Reader;
6 import java.io.StringReader;
7 import java.util.Stack;
8
9 import javax.xml.XMLConstants;
10 import javax.xml.namespace.QName;
11 import javax.xml.stream.XMLEventReader;
12 import javax.xml.stream.XMLInputFactory;
13 import javax.xml.stream.XMLStreamConstants;
14 import javax.xml.stream.XMLStreamException;
15 import javax.xml.stream.events.Attribute;
16 import javax.xml.stream.events.Characters;
17 import javax.xml.stream.events.StartElement;
18 import javax.xml.stream.events.XMLEvent;
19
20 import org.apache.lucene.analysis.TokenStream;
21
22 public class XMLTokenizer extends TokenStream {
23
24 private static final XMLInputFactory factory = XMLInputFactory.newInstance();
25 static {
26 factory.setProperty("javax.xml.stream.isCoalescing", true);
27 }
28 public static XMLInputFactory getXMLInputFactory() {
29 return factory;
30 }
31
32 private static final String XHTML_NS_URI = "http://www.w3.org/1999/xhtml";
33
34 private static class LangStack extends Stack<String> {
35 private static final long serialVersionUID = 7020093255092191463L;
36 private String current = null;
37
38 public LangStack() {
39 }
40
41 @Override
42 public String push(String item) {
43 if (item != null)
44 current = item;
45 super.push(current);
46 return item;
47 }
48
49 @Override
50 public synchronized String pop() {
51 String top = super.pop();
52 current = empty() ? null : peek();
53 return top;
54 }
55
56 public String getCurrent() {
57 return current;
58 }
59 }
60
61 private XMLEventReader r;
62 private PerLanguageAnalyzerMap analyzerMap;
63 private LangStack langs = new LangStack();
64
65 /** Current delegate in use (null if none currently) */
66 private TokenStream delegate = null;
67
68 public XMLTokenizer(Reader reader, PerLanguageAnalyzerMap analyzerMap) throws XMLStreamException {
69 this.analyzerMap = analyzerMap;
70 r = factory.createXMLEventReader(reader);
71 }
72
73 public XMLTokenizer(InputStream in, PerLanguageAnalyzerMap analyzerMap) throws XMLStreamException {
74 this.analyzerMap = analyzerMap;
75 r = factory.createXMLEventReader(in);
76 }
77
78 @Override
79 public final boolean incrementToken() throws IOException {
80 clearAttributes();
81
82 // first try our current string delegate, if we have one
83 if (delegate != null) {
84 if (delegate.incrementToken())
85 return true;
86 else
87 delegate = null;
88 }
89
90 while (r.hasNext()) {
91 XMLEvent event;
92 try {
93 event = r.nextEvent();
94 } catch (XMLStreamException e) {
95 throw new IOException(e);
96 }
97 switch (event.getEventType()) {
98 case XMLStreamConstants.START_ELEMENT:
99 StartElement se = event.asStartElement();
100 langs.push(getLang(se));
101 break;
102 case XMLStreamConstants.CHARACTERS:
103 Characters chars = event.asCharacters();
104 if (chars.isWhiteSpace())
105 break; // don't care
106 Analyzer analyzer = analyzerMap.getAnalyzer(langs.getCurrent());
107 delegate = new OffsetTokenFilter(analyzer.applyFilters(analyzer.tokenizer(this, new StringReader(
108 chars.getData()))), event.getLocation().getCharacterOffset());
109 if (delegate.incrementToken())
110 return true;
111 else
112 delegate = null;
113 break;
114 case XMLStreamConstants.END_ELEMENT:
115 langs.pop();
116 break;
117 }
118 }
119 return false;
120 }
121
122 private String getLang(StartElement se) {
123 // xml:lang takes precedence
124 QName xmlLangQName = new QName(
125 se.getNamespaceURI("") == XMLConstants.XML_NS_URI ? "" : XMLConstants.XML_NS_URI, "lang");
126 Attribute xmlLang = se.getAttributeByName(xmlLangQName);
127 if (xmlLang != null)
128 return xmlLang.getValue();
129
130 QName xhtmlLangQName = new QName(se.getNamespaceURI("") == XHTML_NS_URI ? "" : XHTML_NS_URI, "lang");
131 Attribute xhtmlLang = se.getAttributeByName(xhtmlLangQName);
132 if (xhtmlLang != null)
133 return xhtmlLang.getValue();
134
135 return null;
136 }
137
138 }