Index: src/core/org/terrier/indexing/hadoop/Hadoop_BasicSinglePassIndexer.java
===================================================================
--- src/core/org/terrier/indexing/hadoop/Hadoop_BasicSinglePassIndexer.java	(revision 3776)
+++ src/core/org/terrier/indexing/hadoop/Hadoop_BasicSinglePassIndexer.java	(working copy)
@@ -112,6 +112,9 @@
 	implements Mapper, SplitEmittedTerm, MapEmittedPostingList>, Reducer
 {
+
+	public static final boolean RESET_IDS_ON_FLUSH = false;
+
 	/**
 	 * main
 	 * @param args
@@ -311,6 +314,7 @@
 		value="DM_GC",
 		justification="Forcing GC is an essential part of releasing" +
 			"memory for further indexing")
+	@Override
 	/** causes the posting lists built up in memory to be flushed out */
 	protected void forceFlush() throws IOException
 	{
@@ -325,9 +329,10 @@
 		createMemoryPostings();
 		memoryCheck.reset();
 		numberOfDocsSinceFlush = 0;
-		currentId = 0;
+		if (RESET_IDS_ON_FLUSH)
+			currentId = 0;
 		flushNo++;
-	}
+	}
 
 	/**
 	 * Map processes a single document. Stores the terms in the document along with the posting list
Index: src/core/org/terrier/structures/indexing/singlepass/hadoop/HadoopRunsMerger.java
===================================================================
--- src/core/org/terrier/structures/indexing/singlepass/hadoop/HadoopRunsMerger.java	(revision 3776)
+++ src/core/org/terrier/structures/indexing/singlepass/hadoop/HadoopRunsMerger.java	(working copy)
@@ -31,6 +31,7 @@
 import java.util.LinkedList;
 import java.util.ListIterator;
 
+import org.terrier.indexing.hadoop.Hadoop_BasicSinglePassIndexer;
 import org.terrier.structures.BasicLexiconEntry;
 import org.terrier.structures.LexiconEntry;
 import org.terrier.structures.LexiconOutputStream;
@@ -94,7 +95,7 @@
 		byte startBitOffset = this.getBitOffset();
 		LexiconEntry le = null;
 		// for each run in the list
-		int counter = 0;
+		//int counter = 0;
 		//for one term: for each set of postings for that term
 		while (run.hasNext()) {
 			PostingInRun posting = run.next();
@@ -113,7 +114,7 @@
 			posting.addToLexiconEntry(le);
 			lastFreq += posting.getTF();
 			lastDocFreq += posting.getDf();
-			counter++;
+			//counter++;
 		}
 		le.setTermId(currentTerm++);
 		((BasicLexiconEntry)le).setOffset(startOffset, startBitOffset);
@@ -155,18 +156,22 @@
 		if (correctHRD == null)
 			throw new IOException("Did not find map data for split "+ splitNo);
 
-		// Add the FlushShift
 		int currentFlushDocs=0;
-		ListIterator LI = correctHRD.getFlushDocSizes().listIterator(0);
-		//System.out.println("Runs Flush number : "+run.getRunNo()+", Size of HRD :"+correctHRD.getFlushDocSizes().size());
-		int currentFlush =0;
-		while (currentFlush
+		ListIterator LI = correctHRD.getFlushDocSizes().listIterator(0);
+		//System.out.println("Runs Flush number : "+run.getRunNo()+", Size of HRD :"+correctHRD.getFlushDocSizes().size());
+		int currentFlush =0;
+		while (currentFlush
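
Note for reviewers, not part of the patch: the sketch below illustrates the guarded-reset pattern that the forceFlush() hunk introduces, i.e. document ids keep increasing across flushes unless the compile-time flag RESET_IDS_ON_FLUSH is set; the new import in HadoopRunsMerger suggests the flush-shift bookkeeping there becomes conditional on the same flag, though that hunk is truncated here. Only RESET_IDS_ON_FLUSH, currentId, numberOfDocsSinceFlush and flushNo come from the patch; the class and method names below (FlushExample, indexedDocument) are hypothetical.

// Illustrative, self-contained sketch of the guarded reset; compile and run standalone.
public class FlushExample {

	// Mirrors Hadoop_BasicSinglePassIndexer.RESET_IDS_ON_FLUSH from the patch.
	public static final boolean RESET_IDS_ON_FLUSH = false;

	private int currentId = 0;               // running doc id within this map task
	private int numberOfDocsSinceFlush = 0;  // docs indexed since the last flush
	private int flushNo = 0;                 // how many flushes have happened

	// Record one indexed document.
	public void indexedDocument() {
		currentId++;
		numberOfDocsSinceFlush++;
	}

	// Flush in-memory postings; ids are only reset when the flag asks for it.
	public void forceFlush() {
		// ... write the in-memory postings to disk here ...
		numberOfDocsSinceFlush = 0;
		if (RESET_IDS_ON_FLUSH)
			currentId = 0; // old behaviour: ids restart at 0 after every flush
		flushNo++;
	}

	public static void main(String[] args) {
		FlushExample idx = new FlushExample();
		for (int i = 0; i < 3; i++) idx.indexedDocument();
		idx.forceFlush();
		idx.indexedDocument();
		// With RESET_IDS_ON_FLUSH == false the fourth document gets id 3, not 0,
		// so ids stay unique across flushes within one map task and no per-flush
		// document shift has to be re-applied at merge time.
		System.out.println("currentId after flush + 1 doc: " + idx.currentId);
	}
}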