Index: src/core/org/terrier/indexing/SimpleXMLCollection.java =================================================================== --- src/core/org/terrier/indexing/SimpleXMLCollection.java (revision 3205) +++ src/core/org/terrier/indexing/SimpleXMLCollection.java (working copy) @@ -31,6 +31,7 @@ import java.io.File; import java.io.IOException; import java.io.Reader; +import java.io.StringReader; import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; @@ -43,6 +44,8 @@ import javax.xml.parsers.DocumentBuilderFactory; import org.apache.log4j.Logger; +import org.terrier.indexing.tokenisation.TokenStream; +import org.terrier.indexing.tokenisation.Tokeniser; import org.terrier.utility.ApplicationSetup; import org.terrier.utility.Files; import org.w3c.dom.NamedNodeMap; @@ -83,6 +86,10 @@ private String currentHolder = null; private int termNumber = 0; protected String ThisDocId = null; + + protected Tokeniser tokeniser = Tokeniser.getTokeniser(); + protected TokenStream currentTokenStream; + public XMLDocument(Node root) { this.doRecursive(root); @@ -217,35 +224,15 @@ if(lowercase) t = t.toLowerCase(); - //initialise the stringbuffer with the maximum length of a term (heuristic) - StringBuilder sw = new StringBuilder(tokenMaximumLength); - for(int i=0;i= 'A' && ch <= 'Z') - || (ch >= 'a' && ch <= 'z') - || (ch >= '0' && ch <= '9')))) - { - sw.append(ch); - } - else if (sw.length() > 0) - { - String term = sw.toString(); - if (term.length() > tokenMaximumLength) - term = term.substring(0, tokenMaximumLength); + currentTokenStream = tokeniser.tokenise(new StringReader(t)); + + while (currentTokenStream.hasNext()) { + String term = currentTokenStream.next(); + if (term != null) { terms.add(term); pushField(currentHolder); - sw = new StringBuilder(tokenMaximumLength); } } - if (sw.length() > 0) { - String term = sw.toString(); - if (term.length() > tokenMaximumLength) - term = term.substring(0, tokenMaximumLength); - terms.add(term); - pushField(currentHolder); - } } /** Returns true if no more terms can be fetched from this document */