Index: src/core/org/terrier/structures/indexing/singlepass/hadoop/Hadoop_BasicSinglePassIndexer.java =================================================================== --- src/core/org/terrier/structures/indexing/singlepass/hadoop/Hadoop_BasicSinglePassIndexer.java (revision 3953) +++ src/core/org/terrier/structures/indexing/singlepass/hadoop/Hadoop_BasicSinglePassIndexer.java (working copy) @@ -105,6 +105,12 @@ implements Mapper, SplitEmittedTerm, MapEmittedPostingList>, Reducer { + + /** TREC-388: disable per-flush compression of docids, as docid alignment problems + * can arise if map tasks are restarted. Be very careful of changing this. + */ + static final boolean RESET_IDS_ON_FLUSH = false; + /** * main * @param args @@ -304,6 +310,7 @@ value="DM_GC", justification="Forcing GC is an essential part of releasing" + "memory for further indexing") + @Override /** causes the posting lists built up in memory to be flushed out */ protected void forceFlush() throws IOException { @@ -318,9 +325,10 @@ createMemoryPostings(); memoryCheck.reset(); numberOfDocsSinceFlush = 0; - currentId = 0; + if (RESET_IDS_ON_FLUSH) + currentId = 0; flushNo++; - } + } /** * Map processes a single document. 
Stores the terms in the document along with the posting list @@ -526,7 +534,7 @@ logger.info("Run data file "+ file.getPath().toString()+" has length "+Files.length(file.getPath().toString())); runDataIn = new DataInputStream(Files.openFileStream(file.getPath().toString())); tempHRD = new MapData(runDataIn); - //check to see if this file contaned our split information + //check to see if this file contained our split information if (mutipleIndices && partitionChecker.calculatePartition(tempHRD.getSplitnum(), jc.getNumReduceTasks()) != thisPartition) continue; @@ -605,7 +613,6 @@ logger.info("Merging document and meta indices"); final DocumentIndexBuilder docidOutput = new DocumentIndexBuilder(currentIndex, "document"); final MetaIndexBuilder metaBuilder = this.createMetaIndexBuilder(); - //int i_index = 0; int docCount =-1; TerrierTimer tt = new TerrierTimer("Merging document & meta indices", numdocs); tt.start(); @@ -624,7 +631,6 @@ } IndexUtil.close(docidInput); IndexUtil.close(metaInput1); - //i_index++; } } finally { tt.finished(); @@ -639,6 +645,13 @@ { currentIndex.addIndexStructure("document-factory", SimpleDocumentIndexEntry.Factory.class.getName(), "", ""); } + + //check document counts + if (docCount != numdocs) + { + logger.warn("Mismatch between expected ("+numdocs+") and found document counts ("+docCount+")"); + } + logger.info("Finished merging document indices from "+src.length+" map tasks: "+docCount +" documents found"); } @@ -687,6 +700,12 @@ currentIndex.setIndexProperty("num.Pointers",""+lexstream.getNumberOfPointersWritten() ); if (FieldScore.FIELDS_COUNT > 0) currentIndex.addIndexStructure("lexicon-valuefactory", FieldLexiconEntry.Factory.class.getName(), "java.lang.String", "${index.inverted.fields.count}"); + + if (lexstream.getNumberOfTermsWritten() == 0) + { + logger.warn("Lexicon wrote no terms, but reduceStarted = "+ reduceStarted); + } + this.finishedInvertedIndexBuild(); Index: 
src/core/org/terrier/structures/indexing/singlepass/hadoop/HadoopRunsMerger.java =================================================================== --- src/core/org/terrier/structures/indexing/singlepass/hadoop/HadoopRunsMerger.java (revision 3953) +++ src/core/org/terrier/structures/indexing/singlepass/hadoop/HadoopRunsMerger.java (working copy) @@ -153,18 +153,22 @@ if (correctHRD == null) throw new IOException("Did not find map data for split "+ splitNo); - // Add the FlushShift int currentFlushDocs=0; - ListIterator LI = correctHRD.getFlushDocSizes().listIterator(0); - //System.out.println("Runs Flush number : "+run.getRunNo()+", Size of HRD :"+correctHRD.getFlushDocSizes().size()); - int currentFlush =0; - while (currentFlush LI = correctHRD.getFlushDocSizes().listIterator(0); + //System.out.println("Runs Flush number : "+run.getRunNo()+", Size of HRD :"+correctHRD.getFlushDocSizes().size()); + int currentFlush =0; + while (currentFlush