public class SimpleXMLCollection extends Object implements Collection
Properties:
Modifier and Type | Field and Description |
---|---|
protected static boolean |
bReformXML
Reform invalid XML by copying to temporary file.
|
protected DocumentBuilderFactory |
dbFactory
The xml parser factory for DOM
|
protected DocumentBuilder |
dBuilder
the xml parser
|
protected HashSet<String> |
DocIDBlacklist
A blacklist of documents to ignore.
|
protected boolean |
DocIdIsAttribute
set if DocIdLocation contains ELEMENT_ATTR_SEPARATOR
|
protected String |
DocIdLocation
Contains the name of the tag that contains the document name
|
protected HashSet<String> |
DocumentElements
Contains the names of tags that encapsulate entire documents
|
protected LinkedList<org.terrier.indexing.SimpleXMLCollection.XMLDocument> |
Documents
A list of all the document objects in this XML file
|
protected boolean |
DocumentTags
Set if DocumentElements.size > 0
|
static String |
ELEMENT_ATTR_SEPARATOR
element attribute separator
|
protected boolean |
EOC |
protected LinkedList<String> |
FilesToProcess
The list of files to process.
|
protected static org.apache.log4j.Logger |
logger |
protected boolean |
PropertiesInAttibutes
set if any PropertyElements contains ELEMENT_ATTR_SEPARATOR
|
protected Map<String,Integer> |
PropertyElements
Contains the names of tags and attributes that encapsulate meta properties with their lengths
|
protected HashSet<String> |
TermElements
Contains the names of tags and attributes that encapsulate terms
|
protected boolean |
TermsInAttributes
set if any TermElements contains ELEMENT_ATTR_SEPARATOR
|
protected org.terrier.indexing.SimpleXMLCollection.XMLDocument |
thisDoc
the current XML document that is being read by the indexer
|
protected Document |
xmlDoc
the parsed structure of the XML file we currently have open
|
Constructor and Description |
---|
SimpleXMLCollection()
Construct a SimpleXMLCollection
|
SimpleXMLCollection(List<String> filesToProcess)
Construct a SimpleXMLCollection
|
SimpleXMLCollection(String CollectionSpecFilename,
String BlacklistSpecFilename)
Construct a SimpleXMLCollection
|
Modifier and Type | Method and Description |
---|---|
void |
close()
This operation is not supported by this implementing class.
|
boolean |
endOfCollection()
Returns true if the end of the collection has been reached
|
protected boolean |
findDocumentElement(Node n) |
Document |
getDocument()
Get the document object representing the current document.
|
boolean |
hasNext()
Check whether there is a next document in the collection
|
protected void |
initialiseParser() |
protected void |
initialiseTags() |
static void |
main(String[] args)
main
|
Document |
next()
get the next document
|
boolean |
nextDocument()
Move the collection to the start of the next document.
|
protected boolean |
openNextFile() |
void |
remove()
This is unsupported by this Collection implementation, and
any calls will throw UnsupportedOperationException
Throws UnsupportedOperationException on all invocations
|
void |
reset()
Resets the Collection iterator to the start of the collection.
|
protected static final org.apache.log4j.Logger logger
public static final String ELEMENT_ATTR_SEPARATOR
protected static final boolean bReformXML
protected HashSet<String> DocumentElements
protected boolean DocumentTags
protected HashSet<String> TermElements
protected String DocIdLocation
protected boolean DocIdIsAttribute
protected boolean TermsInAttributes
protected boolean PropertiesInAttibutes
protected Map<String,Integer> PropertyElements
protected DocumentBuilderFactory dbFactory
protected DocumentBuilder dBuilder
protected Document xmlDoc
protected LinkedList<org.terrier.indexing.SimpleXMLCollection.XMLDocument> Documents
protected org.terrier.indexing.SimpleXMLCollection.XMLDocument thisDoc
protected boolean EOC
protected LinkedList<String> FilesToProcess
public SimpleXMLCollection(List<String> filesToProcess)
filesToProcess
- public SimpleXMLCollection()
protected void initialiseParser()
protected void initialiseTags()
public void close()
close
in interface Closeable
close
in interface AutoCloseable
public boolean hasNext()
public Document next()
public void remove()
public boolean endOfCollection()
endOfCollection
in interface Collection
public boolean nextDocument()
nextDocument
in interface Collection
protected boolean findDocumentElement(Node n)
public Document getDocument()
getDocument
in interface Collection
public void reset()
reset
in interface Collection
protected boolean openNextFile()
public static void main(String[] args) throws IOException
args
- IOException
Terrier 4.0. Copyright © 2004-2014 University of Glasgow