public class Hadoop_BasicSinglePassIndexer extends BasicSinglePassIndexer implements org.apache.hadoop.mapred.Mapper<org.apache.hadoop.io.Text,SplitAwareWrapper<Document>,SplitEmittedTerm,MapEmittedPostingList>, org.apache.hadoop.mapred.Reducer<SplitEmittedTerm,MapEmittedPostingList,Object,Object>
BasicIndexer.BasicTermProcessor, BasicIndexer.FieldTermProcessor
Modifier and Type | Field and Description |
---|---|
protected org.apache.hadoop.mapred.Reporter |
currentReporter |
protected LinkedList<Integer> |
flushList
List of how many documents are in each flush we have made
|
protected int |
flushNo
How many flushes have we made
|
protected org.apache.hadoop.mapred.JobConf |
jc
JobConf of the current running job
|
protected org.apache.hadoop.mapred.Reporter |
lastReporter |
protected LexiconOutputStream<String> |
lexstream
OutputStream for the Lexicon
|
protected String[] |
MapIndexPrefixes |
protected String |
mapTaskID
Current map number
|
protected boolean |
mutipleIndices |
protected org.apache.hadoop.mapred.OutputCollector<SplitEmittedTerm,MapEmittedPostingList> |
outputPostingListCollector
output collector for the current map indexing process
|
protected int |
reduceId |
protected boolean |
reduceStarted
records whether the reduce() has been called for the first time
|
protected DataOutputStream |
RunData
OutputStream for the data on the runs (runNo, flushes etc)
|
protected HadoopRunIteratorFactory |
runIteratorF
runIterator factory being used to generate RunIterators
|
protected int |
splitnum
The split that these documents came from
|
protected boolean |
start |
basicInvertedIndexPostingIteratorClass, currentFile, currentId, docsPerCheck, fieldInvertedIndexPostingIteratorClass, fileNames, invertedIndexClass, invertedIndexInputStreamClass, maxDocsPerFlush, maxMemory, memoryAfterFlush, memoryCheck, merger, mp, numberOfDocsSinceCheck, numberOfDocsSinceFlush, numberOfDocuments, numberOfPointers, numberOfTokens, numberOfUniqueTerms, runtime
compressionDirectConfig, compressionInvertedConfig, numOfTokensInDocument, termFields, termsInDocument
BUILDER_BOUNDARY_DOCUMENTS, currentIndex, directIndexBuilder, docIndexBuilder, emptyDocIndexEntry, fieldNames, fileNameNoExtension, IndexEmptyDocuments, invertedIndexBuilder, lexiconBuilder, logger, MAX_DOCS_PER_BUILDER, MAX_TOKENS_IN_DOCUMENT, metaBuilder, numFields, path, pipeline_first, prefix, useFieldInformation
Constructor and Description |
---|
Hadoop_BasicSinglePassIndexer()
Empty constructor
|
Modifier and Type | Method and Description |
---|---|
void |
close()
Called when the Map or Reduce task ends, to finish up the indexer.
|
protected void |
closeMap()
Finish up the map processing.
|
protected void |
closeReduce()
finishes the reduce step, by closing the lexicon and inverted file output,
building the lexicon hash and index, and merging the document indices created
by the map tasks.
|
void |
configure(org.apache.hadoop.mapred.JobConf _jc)
Configure this indexer.
|
protected void |
configureMap() |
protected void |
configureReduce() |
protected MetaIndexBuilder |
createMetaIndexBuilder() |
protected RunsMerger |
createtheRunMerger()
Creates the RunsMerger and the RunIteratorFactory
|
static void |
finish(String destinationIndexPath,
int numberOfReduceTasks,
HadoopPlugin.JobFactory jf)
finish
|
protected void |
forceFlush() |
protected void |
indexEmpty(Map<String,String> docProperties)
Write the empty document to the inverted index
|
protected void |
load_builder_boundary_documents()
Loads the builder boundary documents from the property indexing.builder.boundary.docnos, comma delimited.
|
protected LinkedList<MapData> |
loadRunData() |
static void |
main(String[] args)
main
|
void |
map(org.apache.hadoop.io.Text key,
SplitAwareWrapper<Document> value,
org.apache.hadoop.mapred.OutputCollector<SplitEmittedTerm,MapEmittedPostingList> _outputPostingListCollector,
org.apache.hadoop.mapred.Reporter reporter)
Map processes a single document.
|
protected void |
mergeDocumentIndex(Index[] src,
int numdocs)
Merges the simple document indexes made for each map, creating the final document index instead
|
void |
reduce(SplitEmittedTerm Term,
Iterator<MapEmittedPostingList> postingIterator,
org.apache.hadoop.mapred.OutputCollector<Object,Object> output,
org.apache.hadoop.mapred.Reporter reporter)
Main reduce algorithm step.
|
void |
startReduce(LinkedList<MapData> mapData)
Merges the postings for the current term, converting the document IDs in the
postings to be relative to one another using the run number, number of documents
covered in each run, the flush number for that run and the number of documents
flushed.
|
checkFlush, createDirectIndex, createFieldRunMerger, createInvertedIndex, createInvertedIndex, createMemoryPostings, createRunMerger, finishMemoryPosting, getFileNames, indexDocument, load_indexer_properties, performMultiWayMerge
createDocumentPostings, finishedInvertedIndexBuild, getEndOfPipeline
finishedDirectIndexBuild, index, init, load_field_ids, load_pipeline, merge, merge, mergeTwoIndices, parseInts, useFieldInformation
protected org.apache.hadoop.mapred.JobConf jc
protected int splitnum
protected boolean start
protected org.apache.hadoop.mapred.OutputCollector<SplitEmittedTerm,MapEmittedPostingList> outputPostingListCollector
protected String mapTaskID
protected int flushNo
protected DataOutputStream RunData
protected LinkedList<Integer> flushList
protected org.apache.hadoop.mapred.Reporter currentReporter
protected LexiconOutputStream<String> lexstream
protected HadoopRunIteratorFactory runIteratorF
protected boolean reduceStarted
protected boolean mutipleIndices
protected int reduceId
protected String[] MapIndexPrefixes
protected org.apache.hadoop.mapred.Reporter lastReporter
public Hadoop_BasicSinglePassIndexer()
public static void main(String[] args) throws Exception
args
- Exception
public static void finish(String destinationIndexPath, int numberOfReduceTasks, HadoopPlugin.JobFactory jf) throws Exception
destinationIndexPath
- numberOfReduceTasks
- jf
- Exception
public void configure(org.apache.hadoop.mapred.JobConf _jc)
configure
in interface org.apache.hadoop.mapred.JobConfigurable
_jc
- The configuration for the job
public void close() throws IOException
close
in interface Closeable
close
in interface AutoCloseable
IOException
protected void load_builder_boundary_documents()
Indexer
load_builder_boundary_documents
in class Indexer
protected MetaIndexBuilder createMetaIndexBuilder()
createMetaIndexBuilder
in class Indexer
protected void forceFlush() throws IOException
forceFlush
in class BasicSinglePassIndexer
IOException
public void map(org.apache.hadoop.io.Text key, SplitAwareWrapper<Document> value, org.apache.hadoop.mapred.OutputCollector<SplitEmittedTerm,MapEmittedPostingList> _outputPostingListCollector, org.apache.hadoop.mapred.Reporter reporter) throws IOException
map
in interface org.apache.hadoop.mapred.Mapper<org.apache.hadoop.io.Text,SplitAwareWrapper<Document>,SplitEmittedTerm,MapEmittedPostingList>
key
- Wrapper for Document Number
value
- Wrapper for Document Object
_outputPostingListCollector
- Collector for emitting terms and postings lists
IOException
protected void indexEmpty(Map<String,String> docProperties) throws IOException
indexEmpty
in class Indexer
IOException
protected void closeMap() throws IOException
IOException
protected LinkedList<MapData> loadRunData() throws IOException
IOException
public void startReduce(LinkedList<MapData> mapData) throws IOException
mapData
- info about the runs (maps) and the flushes
IOException
public void reduce(SplitEmittedTerm Term, Iterator<MapEmittedPostingList> postingIterator, org.apache.hadoop.mapred.OutputCollector<Object,Object> output, org.apache.hadoop.mapred.Reporter reporter) throws IOException
reduce
in interface org.apache.hadoop.mapred.Reducer<SplitEmittedTerm,MapEmittedPostingList,Object,Object>
Term
- indexing term which we are reducing the posting lists into
postingIterator
- Iterator over the temporary posting lists we have for this term
output
- Unused output collector
reporter
- Used to report progress
IOException
protected void mergeDocumentIndex(Index[] src, int numdocs) throws IOException
IOException
protected void closeReduce() throws IOException
IOException
protected RunsMerger createtheRunMerger()
Terrier Information Retrieval Platform 4.1. Copyright © 2004-2015, University of Glasgow