lucene-multilingual

Multilingual enhancements for the Lucene text search library
git clone https://code.djc.id.au/git/lucene-multilingual/

src/main/java/au/com/miskinhill/search/analysis/XMLTokenizer.java (4585B) - raw

      1 package au.com.miskinhill.search.analysis;
      2 
      3 import java.io.IOException;
      4 import java.io.InputStream;
      5 import java.io.Reader;
      6 import java.io.StringReader;
      7 import java.util.Stack;
      8 
      9 import javax.xml.XMLConstants;
     10 import javax.xml.namespace.QName;
     11 import javax.xml.stream.XMLEventReader;
     12 import javax.xml.stream.XMLInputFactory;
     13 import javax.xml.stream.XMLStreamConstants;
     14 import javax.xml.stream.XMLStreamException;
     15 import javax.xml.stream.events.Attribute;
     16 import javax.xml.stream.events.Characters;
     17 import javax.xml.stream.events.StartElement;
     18 import javax.xml.stream.events.XMLEvent;
     19 
     20 import org.apache.lucene.analysis.TokenStream;
     21 
     22 public class XMLTokenizer extends TokenStream {
     23 
     24     private static final XMLInputFactory factory = XMLInputFactory.newInstance();
     25     static {
     26         factory.setProperty("javax.xml.stream.isCoalescing", true);
     27     }
     28     public static XMLInputFactory getXMLInputFactory() {
     29         return factory;
     30     }
     31 
     32     private static final String XHTML_NS_URI = "http://www.w3.org/1999/xhtml";
     33 
     34     private static class LangStack extends Stack<String> {
     35         private static final long serialVersionUID = 7020093255092191463L;
     36         private String current = null;
     37 
     38         public LangStack() {
     39         }
     40 
     41         @Override
     42         public String push(String item) {
     43             if (item != null)
     44                 current = item;
     45             super.push(current);
     46             return item;
     47         }
     48 
     49         @Override
     50         public synchronized String pop() {
     51             String top = super.pop();
     52             current = empty() ? null : peek();
     53             return top;
     54         }
     55 
     56         public String getCurrent() {
     57             return current;
     58         }
     59     }
     60 
     61     private XMLEventReader r;
     62     private PerLanguageAnalyzerMap analyzerMap;
     63     private LangStack langs = new LangStack();
     64 
     65     /** Current delegate in use (null if none currently) */
     66     private TokenStream delegate = null;
     67 
     68     public XMLTokenizer(Reader reader, PerLanguageAnalyzerMap analyzerMap) throws XMLStreamException {
     69         this.analyzerMap = analyzerMap;
     70         r = factory.createXMLEventReader(reader);
     71     }
     72 
     73     public XMLTokenizer(InputStream in, PerLanguageAnalyzerMap analyzerMap) throws XMLStreamException {
     74         this.analyzerMap = analyzerMap;
     75         r = factory.createXMLEventReader(in);
     76     }
     77 
     78     @Override
     79     public final boolean incrementToken() throws IOException {
     80         clearAttributes();
     81 
     82         // first try our current string delegate, if we have one
     83         if (delegate != null) {
     84             if (delegate.incrementToken())
     85                 return true;
     86             else
     87                 delegate = null;
     88         }
     89 
     90         while (r.hasNext()) {
     91             XMLEvent event;
     92             try {
     93                 event = r.nextEvent();
     94             } catch (XMLStreamException e) {
     95                 throw new IOException(e);
     96             }
     97             switch (event.getEventType()) {
     98                 case XMLStreamConstants.START_ELEMENT:
     99                     StartElement se = event.asStartElement();
    100                     langs.push(getLang(se));
    101                     break;
    102                 case XMLStreamConstants.CHARACTERS:
    103                     Characters chars = event.asCharacters();
    104                     if (chars.isWhiteSpace())
    105                         break; // don't care
    106                     Analyzer analyzer = analyzerMap.getAnalyzer(langs.getCurrent());
    107                     delegate = new OffsetTokenFilter(analyzer.applyFilters(analyzer.tokenizer(this, new StringReader(
    108                             chars.getData()))), event.getLocation().getCharacterOffset());
    109                     if (delegate.incrementToken())
    110                         return true;
    111                     else
    112                         delegate = null;
    113                     break;
    114                 case XMLStreamConstants.END_ELEMENT:
    115                     langs.pop();
    116                     break;
    117             }
    118         }
    119         return false;
    120     }
    121 
    122     private String getLang(StartElement se) {
    123         // xml:lang takes precedence
    124         QName xmlLangQName = new QName(
    125                 se.getNamespaceURI("") == XMLConstants.XML_NS_URI ? "" : XMLConstants.XML_NS_URI, "lang");
    126         Attribute xmlLang = se.getAttributeByName(xmlLangQName);
    127         if (xmlLang != null)
    128             return xmlLang.getValue();
    129 
    130         QName xhtmlLangQName = new QName(se.getNamespaceURI("") == XHTML_NS_URI ? "" : XHTML_NS_URI, "lang");
    131         Attribute xhtmlLang = se.getAttributeByName(xhtmlLangQName);
    132         if (xhtmlLang != null)
    133             return xhtmlLang.getValue();
    134 
    135         return null;
    136     }
    137 
    138 }