public class LexiconBuilder extends Object
Modifier and Type | Class and Description |
---|---|
static class |
LexiconBuilder.BasicLexiconCollectionStaticticsCounter
counts global statistics in the non-fields case
|
static interface |
LexiconBuilder.CollectionStatisticsCounter
Counter of LexiconEntries
|
protected static class |
LexiconBuilder.FieldLexiconCollectionStaticticsCounter
counts global statistics in the fields case
|
protected static class |
LexiconBuilder.NullCollectionStatisticsCounter |
Modifier and Type | Field and Description |
---|---|
protected String |
defaultStructureName |
protected int |
DocCount
How many documents have been processed so far.
|
protected static int |
DocumentsPerLexicon
The number of documents for which a temporary lexicon is created.
|
protected IndexOnDisk |
index |
protected String |
indexPath
The directory to write the final lexicons to
|
protected String |
indexPrefix
The filename of the lexicons.
|
protected String |
lexiconEntryFactoryValueClass |
protected Class<? extends LexiconOutputStream> |
lexiconOutputStream
Class to be used as the LexiconOutputStream.
|
protected static org.slf4j.Logger |
logger
The logger used for this class
|
protected static int |
MAXLEXMERGE
Number of lexicons to merge at once.
|
protected static boolean |
MERGE2LEXATTIME
Should we only merge lexicons in pairs (Terrier 1.0.x scheme)? Set by property lexicon.builder.merge.2lex.attime
|
protected LexiconMap |
TempLex
The lexicontree to write the current term stream to
|
protected int |
TempLexCount
How many temporary lexicons have been generated so far
|
protected LinkedList<String> |
tempLexFiles
The linkedlist in which the temporary lexicon structure names are stored.
|
protected TermCodes |
termCodes |
protected int |
TermCount
How many terms are in the final lexicon
|
protected FixedSizeWriteableFactory<LexiconEntry> |
valueFactory |
Constructor and Description |
---|
LexiconBuilder(IndexOnDisk i,
String _structureName,
Class<? extends LexiconMap> _LexiconMapClass,
String _lexiconEntryClass,
TermCodes termCodes)
constructor
|
LexiconBuilder(IndexOnDisk i,
String _structureName,
LexiconMap lexiconMap,
String _lexiconEntryClass,
String valueFactoryParamTypes,
String valueFactoryParamValues,
TermCodes _termCodes)
constructor
|
LexiconBuilder(IndexOnDisk i,
String _structureName,
LexiconMap lexiconMap,
String _lexiconEntryClass,
TermCodes termCodes)
constructor
|
LexiconBuilder(IndexOnDisk i,
String _structureName,
TermCodes tc)
constructor
|
Modifier and Type | Method and Description |
---|---|
void |
addDocumentTerms(DocumentPostingList terms)
adds the terms of a document to the temporary lexicon in memory.
|
void |
addTemporaryLexicon(String structureName)
Deprecated.
|
void |
addTerm(String term,
int tf)
Add a single term to the lexicon being built
|
static void |
createLexiconHash(IndexOnDisk index)
Deprecated.
use optimise instead
|
static void |
createLexiconIndex(IndexOnDisk index)
Deprecated.
use optimise instead
|
void |
finishedDirectIndexBuild()
Processes the lexicon after the direct and document indexes
have been created.
|
void |
finishedInvertedIndexBuild()
Processes the lexicon after the inverted index
has been created.
|
void |
flush()
Force a temporary lexicon to be flushed
|
int |
getFinalNumberOfTerms()
Returns the number of terms in the final lexicon.
|
protected Iterator<Map.Entry<String,LexiconEntry>> |
getLexInputStream(String structureName)
return the lexicon input stream for the current index at the specified filename
|
protected LexiconOutputStream<String> |
getLexOutputStream(String structureName)
return the lexicon outputstream for the current index at the specified filename
|
protected static LexiconMap |
instantiate(Class<? extends LexiconMap> LexiconMapClass) |
void |
merge(LinkedList<String> filesToMerge)
Merges the intermediate lexicon files created during the indexing.
|
protected void |
mergeNLexicons(Iterator<Map.Entry<String,LexiconEntry>>[] lis,
LexiconOutputStream<String> los) |
protected void |
mergeTwoLexicons(Iterator<Map.Entry<String,LexiconEntry>> lis1,
Iterator<Map.Entry<String,LexiconEntry>> lis2,
LexiconOutputStream<String> los)
Merge the two LexiconInputStreams into the given LexiconOutputStream
|
protected LexiconEntry |
newLexiconEntry(int termid) |
static void |
optimise(IndexOnDisk index,
String structureName)
Optimises the lexicon, e.g. the lexid file
|
void |
optimiseLexicon()
optimise the lexicon
|
static void |
reAssignTermIds(IndexOnDisk index,
String structureName,
int numEntries)
Re-assigns the termids within the named lexicon structure to be ascending with
descending term frequency, i.e. the most frequent term receives the lowest termid.
|
protected void |
writeTemporaryLexicon()
Writes the current contents of TempLex temporary lexicon binary tree down to
a temporary disk lexicon.
|
protected Class<? extends LexiconOutputStream> lexiconOutputStream
protected final String lexiconEntryFactoryValueClass
protected static final org.slf4j.Logger logger
protected int DocCount
protected int TermCount
protected static final int DocumentsPerLexicon
protected final LinkedList<String> tempLexFiles
protected LexiconMap TempLex
protected TermCodes termCodes
protected String indexPath
protected String indexPrefix
protected IndexOnDisk index
protected int TempLexCount
protected static final boolean MERGE2LEXATTIME
protected static final int MAXLEXMERGE
protected String defaultStructureName
protected FixedSizeWriteableFactory<LexiconEntry> valueFactory
public LexiconBuilder(IndexOnDisk i, String _structureName, TermCodes tc)
i
- _structureName
- public LexiconBuilder(IndexOnDisk i, String _structureName, Class<? extends LexiconMap> _LexiconMapClass, String _lexiconEntryClass, TermCodes termCodes)
i
- _structureName
- _LexiconMapClass
- _lexiconEntryClass
- public LexiconBuilder(IndexOnDisk i, String _structureName, LexiconMap lexiconMap, String _lexiconEntryClass, TermCodes termCodes)
i
- _structureName
- lexiconMap
- _lexiconEntryClass
- public LexiconBuilder(IndexOnDisk i, String _structureName, LexiconMap lexiconMap, String _lexiconEntryClass, String valueFactoryParamTypes, String valueFactoryParamValues, TermCodes _termCodes)
i
- _structureName
- lexiconMap
- _lexiconEntryClass
- valueFactoryParamTypes
- valueFactoryParamValues
- protected static LexiconMap instantiate(Class<? extends LexiconMap> LexiconMapClass)
public int getFinalNumberOfTerms()
public void addTemporaryLexicon(String structureName)
structureName
- Full path to a lexicon to merge
protected void writeTemporaryLexicon()
public void addTerm(String term, int tf)
term
- The String term
tf
- the frequency of the term
public void addDocumentTerms(DocumentPostingList terms)
terms
- DocumentPostingList the terms of the document to add to the temporary lexicon
public void flush()
public void finishedInvertedIndexBuild()
public void finishedDirectIndexBuild()
public void merge(LinkedList<String> filesToMerge) throws IOException
filesToMerge
- java.util.LinkedList the list containing the
filenames of the temporary files.
IOException
- an input/output exception is thrown
if a problem is encountered.
protected LexiconEntry newLexiconEntry(int termid)
protected void mergeNLexicons(Iterator<Map.Entry<String,LexiconEntry>>[] lis, LexiconOutputStream<String> los) throws IOException
IOException
protected void mergeTwoLexicons(Iterator<Map.Entry<String,LexiconEntry>> lis1, Iterator<Map.Entry<String,LexiconEntry>> lis2, LexiconOutputStream<String> los) throws IOException
lis1
- First lexicon to be merged
lis2
- Second lexicon to be merged
los
- Lexicon to be merged to
IOException
public static void createLexiconIndex(IndexOnDisk index) throws IOException
index
- IndexOnDisk to make the lexicon index for
IOException
public static void createLexiconHash(IndexOnDisk index) throws IOException
index
- IndexOnDisk to make the lexicon hash for
IOException
public void optimiseLexicon()
public static void optimise(IndexOnDisk index, String structureName)
public static void reAssignTermIds(IndexOnDisk index, String structureName, int numEntries) throws IOException
index
- structureName
- numEntries
- IOException
protected Iterator<Map.Entry<String,LexiconEntry>> getLexInputStream(String structureName) throws IOException
IOException
protected LexiconOutputStream<String> getLexOutputStream(String structureName) throws IOException
IOException
Terrier Information Retrieval Platform 5.1. Copyright © 2004-2019, University of Glasgow