|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object org.terrier.structures.indexing.LexiconBuilder
public class LexiconBuilder
Builds temporary lexicons while indexing a collection and merges them once the indexing of the collection has finished.
Nested Class Summary | |
---|---|
static class |
LexiconBuilder.BasicLexiconCollectionStaticticsCounter
counts global statistics in the non-fields case |
static interface |
LexiconBuilder.CollectionStatisticsCounter
Counter of LexiconEntries |
protected static class |
LexiconBuilder.FieldLexiconCollectionStaticticsCounter
counts global statistics in the fields case |
protected static class |
LexiconBuilder.NullCollectionStatisticsCounter
|
Field Summary | |
---|---|
protected java.lang.String |
defaultStructureName
|
protected int |
DocCount
How many documents have been processed so far. |
protected static int |
DocumentsPerLexicon
The number of documents for which a temporary lexicon is created. |
protected Index |
index
|
protected java.lang.String |
indexPath
The directory to write the final lexicons to |
protected java.lang.String |
indexPrefix
The filename of the lexicons. |
protected java.lang.String |
lexiconEntryFactoryValueClass
|
protected java.lang.Class<? extends LexiconOutputStream> |
lexiconOutputStream
Class to be used as a LexiconOutputStream. |
protected static org.apache.log4j.Logger |
logger
The logger used for this class |
protected static int |
MAXLEXMERGE
Number of lexicons to merge at once. |
protected static boolean |
MERGE2LEXATTIME
Should we only merge lexicons in pairs (Terrier 1.0.x scheme)? Set by property lexicon.builder.merge.2lex.attime |
protected LexiconMap |
TempLex
The lexicontree to write the current term stream to |
protected int |
TempLexCount
How many temporary lexicons have been generated so far |
protected java.util.LinkedList<java.lang.String> |
tempLexFiles
The linkedlist in which the temporary lexicon structure names are stored. |
protected int |
TermCount
How many terms are in the final lexicon |
protected FixedSizeWriteableFactory<LexiconEntry> |
valueFactory
|
Constructor Summary | |
---|---|
LexiconBuilder(Index i,
java.lang.String _structureName)
constructor |
|
LexiconBuilder(Index i,
java.lang.String _structureName,
java.lang.Class<? extends LexiconMap> _LexiconMapClass,
java.lang.String _lexiconEntryClass)
constructor |
|
LexiconBuilder(Index i,
java.lang.String _structureName,
LexiconMap lexiconMap,
java.lang.String _lexiconEntryClass)
constructor |
Method Summary | |
---|---|
void |
addDocumentTerms(DocumentPostingList terms)
adds the terms of a document to the temporary lexicon in memory. |
void |
addTemporaryLexicon(java.lang.String structureName)
Deprecated. |
void |
addTerm(java.lang.String term,
int tf)
Add a single term to the lexicon being built |
static void |
createLexiconHash(Index index)
Deprecated. use optimise instead |
static void |
createLexiconIndex(Index index)
Deprecated. use optimise instead |
void |
finishedDirectIndexBuild()
Processes the lexicon after the direct and document indexes have been created. |
void |
finishedInvertedIndexBuild()
Processes the lexicon after the inverted index has been created. |
void |
flush()
Force a temporary lexicon to be flushed |
int |
getFinalNumberOfTerms()
Returns the number of terms in the final lexicon. |
protected java.util.Iterator<java.util.Map.Entry<java.lang.String,LexiconEntry>> |
getLexInputStream(java.lang.String structureName)
return the lexicon input stream for the current index at the specified filename |
protected LexiconOutputStream<java.lang.String> |
getLexOutputStream(java.lang.String structureName)
return the lexicon output stream for the current index at the specified filename |
protected static LexiconMap |
instantiate(java.lang.Class<? extends LexiconMap> LexiconMapClass)
|
void |
merge(java.util.LinkedList<java.lang.String> filesToMerge)
Merges the intermediate lexicon files created during the indexing. |
protected void |
mergeNLexicons(java.util.Iterator<java.util.Map.Entry<java.lang.String,LexiconEntry>>[] lis,
LexiconOutputStream<java.lang.String> los)
|
protected void |
mergeTwoLexicons(java.util.Iterator<java.util.Map.Entry<java.lang.String,LexiconEntry>> lis1,
java.util.Iterator<java.util.Map.Entry<java.lang.String,LexiconEntry>> lis2,
LexiconOutputStream<java.lang.String> los)
Merge the two LexiconInputStreams into the given LexiconOutputStream |
protected LexiconEntry |
newLexiconEntry(int termid)
|
static void |
optimise(Index index,
java.lang.String structureName)
Optimises the lexicon, eg lexid file |
void |
optimiseLexicon()
optimise the lexicon |
protected void |
writeTemporaryLexicon()
Writes the current contents of TempLex temporary lexicon binary tree down to a temporary disk lexicon. |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
---|
protected java.lang.Class<? extends LexiconOutputStream> lexiconOutputStream
protected final java.lang.String lexiconEntryFactoryValueClass
protected static final org.apache.log4j.Logger logger
protected int DocCount
protected int TermCount
protected static final int DocumentsPerLexicon
protected final java.util.LinkedList<java.lang.String> tempLexFiles
protected LexiconMap TempLex
protected java.lang.String indexPath
protected java.lang.String indexPrefix
protected Index index
protected int TempLexCount
protected static final boolean MERGE2LEXATTIME
protected static final int MAXLEXMERGE
protected java.lang.String defaultStructureName
protected FixedSizeWriteableFactory<LexiconEntry> valueFactory
Constructor Detail |
---|
public LexiconBuilder(Index i, java.lang.String _structureName)
i
- _structureName
- public LexiconBuilder(Index i, java.lang.String _structureName, java.lang.Class<? extends LexiconMap> _LexiconMapClass, java.lang.String _lexiconEntryClass)
i
- _structureName
- _LexiconMapClass
- _lexiconEntryClass
- public LexiconBuilder(Index i, java.lang.String _structureName, LexiconMap lexiconMap, java.lang.String _lexiconEntryClass)
i
- _structureName
- lexiconMap
- _lexiconEntryClass
- Method Detail |
---|
protected static LexiconMap instantiate(java.lang.Class<? extends LexiconMap> LexiconMapClass)
public int getFinalNumberOfTerms()
public void addTemporaryLexicon(java.lang.String structureName)
structureName
- Full path to a lexicon to merge
protected void writeTemporaryLexicon()
public void addTerm(java.lang.String term, int tf)
term
- The String term
tf
- the frequency of the term
public void addDocumentTerms(DocumentPostingList terms)
terms
- DocumentPostingList the terms of the document to add to the temporary lexicon
public void flush()
public void finishedInvertedIndexBuild()
public void finishedDirectIndexBuild()
public void merge(java.util.LinkedList<java.lang.String> filesToMerge) throws java.io.IOException
filesToMerge
- java.util.LinkedList the list containing the
filenames of the temporary files.
java.io.IOException
- an input/output exception is thrown
if a problem is encountered.
protected LexiconEntry newLexiconEntry(int termid)
protected void mergeNLexicons(java.util.Iterator<java.util.Map.Entry<java.lang.String,LexiconEntry>>[] lis, LexiconOutputStream<java.lang.String> los) throws java.io.IOException
java.io.IOException
protected void mergeTwoLexicons(java.util.Iterator<java.util.Map.Entry<java.lang.String,LexiconEntry>> lis1, java.util.Iterator<java.util.Map.Entry<java.lang.String,LexiconEntry>> lis2, LexiconOutputStream<java.lang.String> los) throws java.io.IOException
lis1
- First lexicon to be merged
lis2
- Second lexicon to be merged
los
- Lexicon to be merged to
java.io.IOException
public static void createLexiconIndex(Index index) throws java.io.IOException
index
- Index to make the lexicon index for
java.io.IOException
public static void createLexiconHash(Index index) throws java.io.IOException
index
- Index to make the lexicon hash for
java.io.IOException
public void optimiseLexicon()
public static void optimise(Index index, java.lang.String structureName)
protected java.util.Iterator<java.util.Map.Entry<java.lang.String,LexiconEntry>> getLexInputStream(java.lang.String structureName) throws java.io.IOException
java.io.IOException
protected LexiconOutputStream<java.lang.String> getLexOutputStream(java.lang.String structureName) throws java.io.IOException
java.io.IOException
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |