|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |
java.lang.Object org.terrier.indexing.Indexer org.terrier.indexing.BasicIndexer org.terrier.indexing.BasicSinglePassIndexer org.terrier.indexing.hadoop.Hadoop_BasicSinglePassIndexer
public class Hadoop_BasicSinglePassIndexer
Single Pass MapReduce indexer.
Nested Class Summary |
---|
Nested classes/interfaces inherited from class org.terrier.indexing.BasicIndexer |
---|
BasicIndexer.BasicTermProcessor, BasicIndexer.FieldTermProcessor |
Field Summary | |
---|---|
protected org.apache.hadoop.mapred.Reporter |
currentReporter
|
protected java.util.LinkedList<java.lang.Integer> |
flushList
List of how many documents are in each flush we have made |
protected int |
flushNo
How many flushes have we made |
protected org.apache.hadoop.mapred.JobConf |
jc
JobConf of the current running job |
protected org.apache.hadoop.mapred.Reporter |
lastReporter
|
protected LexiconOutputStream<java.lang.String> |
lexstream
OutputStream for the Lexicon |
protected java.lang.String[] |
MapIndexPrefixes
|
protected java.lang.String |
mapTaskID
Current map number |
protected boolean |
mutipleIndices
|
protected org.apache.hadoop.mapred.OutputCollector<SplitEmittedTerm,MapEmittedPostingList> |
outputPostingListCollector
output collector for the current map indexing process |
protected int |
reduceId
|
protected boolean |
reduceStarted
records whether the reduce() has been called for the first time |
protected java.io.DataOutputStream |
RunData
OutputStream for the data on the runs (runNo, flushes etc) |
protected HadoopRunIteratorFactory |
runIteratorF
runIterator factory being used to generate RunIterators |
protected int |
splitnum
The split that these documents came from |
protected boolean |
start
|
Fields inherited from class org.terrier.indexing.BasicIndexer |
---|
numOfTokensInDocument, termFields, termsInDocument |
Constructor Summary | |
---|---|
Hadoop_BasicSinglePassIndexer()
Empty constructor. |
Method Summary | |
---|---|
void |
close()
Called when the Map or Reduce task ends, to finish up the indexer. |
protected void |
closeMap()
Finish up the map processing. |
protected void |
closeReduce()
finishes the reduce step, by closing the lexicon and inverted file output, building the lexicon hash and index, and merging the document indices created by the map tasks. |
void |
configure(org.apache.hadoop.mapred.JobConf _jc)
Configure this indexer. |
protected void |
configureMap()
|
protected void |
configureReduce()
|
protected MetaIndexBuilder |
createMetaIndexBuilder()
|
protected RunsMerger |
createtheRunMerger()
Creates the RunsMerger and the RunIteratorFactory |
static void |
finish(java.lang.String destinationIndexPath,
int numberOfReduceTasks,
HadoopPlugin.JobFactory jf)
finish |
protected void |
forceFlush()
|
protected void |
indexEmpty(java.util.Map<java.lang.String,java.lang.String> docProperties)
Write the empty document to the inverted index |
protected void |
load_builder_boundary_documents()
Loads the builder boundary documents from the property indexing.builder.boundary.docnos, comma delimited. |
protected java.util.LinkedList<MapData> |
loadRunData()
|
static void |
main(java.lang.String[] args)
main |
void |
map(org.apache.hadoop.io.Text key,
SplitAwareWrapper<Document> value,
org.apache.hadoop.mapred.OutputCollector<SplitEmittedTerm,MapEmittedPostingList> _outputPostingListCollector,
org.apache.hadoop.mapred.Reporter reporter)
Map processes a single document. |
protected void |
mergeDocumentIndex(Index[] src)
Merges the simple document indexes made for each map, creating the final document index instead |
void |
reduce(SplitEmittedTerm Term,
java.util.Iterator<MapEmittedPostingList> postingIterator,
org.apache.hadoop.mapred.OutputCollector<java.lang.Object,java.lang.Object> output,
org.apache.hadoop.mapred.Reporter reporter)
Main reduce algorithm step. |
void |
startReduce(java.util.LinkedList<MapData> mapData)
Merges the postings for the current term and converts the document IDs in the postings to be relative to one another, using the run number, the number of documents covered in each run, the flush number for that run and the number of documents flushed. |
Methods inherited from class org.terrier.indexing.BasicSinglePassIndexer |
---|
checkFlush, createDirectIndex, createFieldRunMerger, createInvertedIndex, createInvertedIndex, createMemoryPostings, createRunMerger, finishMemoryPosting, getFileNames, indexDocument, load_indexer_properties, performMultiWayMerge |
Methods inherited from class org.terrier.indexing.BasicIndexer |
---|
createDocumentPostings, finishedInvertedIndexBuild, getEndOfPipeline |
Methods inherited from class org.terrier.indexing.Indexer |
---|
finishedDirectIndexBuild, index, init, load_field_ids, load_pipeline, merge, merge, mergeTwoIndices, parseInts, useFieldInformation |
Methods inherited from class java.lang.Object |
---|
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait |
Field Detail |
---|
protected org.apache.hadoop.mapred.JobConf jc
protected int splitnum
protected boolean start
protected org.apache.hadoop.mapred.OutputCollector<SplitEmittedTerm,MapEmittedPostingList> outputPostingListCollector
protected java.lang.String mapTaskID
protected int flushNo
protected java.io.DataOutputStream RunData
protected java.util.LinkedList<java.lang.Integer> flushList
protected org.apache.hadoop.mapred.Reporter currentReporter
protected LexiconOutputStream<java.lang.String> lexstream
protected HadoopRunIteratorFactory runIteratorF
protected boolean reduceStarted
protected boolean mutipleIndices
protected int reduceId
protected java.lang.String[] MapIndexPrefixes
protected org.apache.hadoop.mapred.Reporter lastReporter
Constructor Detail |
---|
public Hadoop_BasicSinglePassIndexer()
Method Detail |
---|
public static void main(java.lang.String[] args) throws java.lang.Exception
args
-
java.lang.Exception
public static void finish(java.lang.String destinationIndexPath, int numberOfReduceTasks, HadoopPlugin.JobFactory jf) throws java.lang.Exception
destinationIndexPath
-
numberOfReduceTasks
-
jf
-
java.lang.Exception
public void configure(org.apache.hadoop.mapred.JobConf _jc)
configure
in interface org.apache.hadoop.mapred.JobConfigurable
_jc
- The configuration for the job
public void close() throws java.io.IOException
close
in interface java.io.Closeable
java.io.IOException
protected void load_builder_boundary_documents()
Indexer
load_builder_boundary_documents
in class Indexer
protected void configureMap() throws java.lang.Exception
java.lang.Exception
protected MetaIndexBuilder createMetaIndexBuilder()
createMetaIndexBuilder
in class Indexer
protected void forceFlush() throws java.io.IOException
forceFlush
in class BasicSinglePassIndexer
java.io.IOException
public void map(org.apache.hadoop.io.Text key, SplitAwareWrapper<Document> value, org.apache.hadoop.mapred.OutputCollector<SplitEmittedTerm,MapEmittedPostingList> _outputPostingListCollector, org.apache.hadoop.mapred.Reporter reporter) throws java.io.IOException
map
in interface org.apache.hadoop.mapred.Mapper<org.apache.hadoop.io.Text,SplitAwareWrapper<Document>,SplitEmittedTerm,MapEmittedPostingList>
key
- Wrapper for Document Number
value
- Wrapper for Document Object
_outputPostingListCollector
- Collector for emitting terms and postings lists
java.io.IOException
protected void indexEmpty(java.util.Map<java.lang.String,java.lang.String> docProperties) throws java.io.IOException
indexEmpty
in class Indexer
java.io.IOException
protected void closeMap() throws java.io.IOException
java.io.IOException
protected void configureReduce() throws java.lang.Exception
java.lang.Exception
protected java.util.LinkedList<MapData> loadRunData() throws java.io.IOException
java.io.IOException
public void startReduce(java.util.LinkedList<MapData> mapData) throws java.io.IOException
mapData
- info about the runs (maps) and the flushes
java.io.IOException
public void reduce(SplitEmittedTerm Term, java.util.Iterator<MapEmittedPostingList> postingIterator, org.apache.hadoop.mapred.OutputCollector<java.lang.Object,java.lang.Object> output, org.apache.hadoop.mapred.Reporter reporter) throws java.io.IOException
reduce
in interface org.apache.hadoop.mapred.Reducer<SplitEmittedTerm,MapEmittedPostingList,java.lang.Object,java.lang.Object>
Term
- indexing term which we are reducing the posting lists into
postingIterator
- Iterator over the temporary posting lists we have for this term
output
- Unused output collector
reporter
- Used to report progress
java.io.IOException
protected void mergeDocumentIndex(Index[] src) throws java.io.IOException
java.io.IOException
protected void closeReduce() throws java.io.IOException
java.io.IOException
protected RunsMerger createtheRunMerger()
|
||||||||||
PREV CLASS NEXT CLASS | FRAMES NO FRAMES | |||||||||
SUMMARY: NESTED | FIELD | CONSTR | METHOD | DETAIL: FIELD | CONSTR | METHOD |