diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/TrecTerrier.java src/TrecTerrier.java --- ../../../CURRENT_HEAD/terrier/src/TrecTerrier.java 2009-01-29 15:39:55.000000000 +0000 +++ src/TrecTerrier.java 2009-03-03 14:34:49.000000000 +0000 @@ -26,6 +26,7 @@ import java.io.File; import org.apache.log4j.Logger; + import uk.ac.gla.terrier.applications.HadoopIndexing; import uk.ac.gla.terrier.applications.TRECIndexing; import uk.ac.gla.terrier.applications.TRECLMIndexing; @@ -37,10 +38,9 @@ import uk.ac.gla.terrier.evaluation.NamedPageEvaluation; import uk.ac.gla.terrier.structures.DirectIndexInputStream; import uk.ac.gla.terrier.structures.DocumentIndexInputStream; -import uk.ac.gla.terrier.structures.InvertedIndexInputStream; -import uk.ac.gla.terrier.structures.InvertedIndex; import uk.ac.gla.terrier.structures.Index; -import uk.ac.gla.terrier.structures.LexiconInputStream; +import uk.ac.gla.terrier.structures.InvertedIndexInputStream; +import uk.ac.gla.terrier.structures.LexiconUtil; import uk.ac.gla.terrier.utility.ApplicationSetup; import uk.ac.gla.terrier.utility.Files; /** @@ -330,12 +330,6 @@ if (printdocid && !Files.exists(ApplicationSetup.DOCUMENT_INDEX_FILENAME)) return ERROR_PRINT_DOCINDEX_FILE_NOT_EXISTS; - if (printlexicon && !Files.exists(ApplicationSetup.LEXICON_FILENAME)) - return ERROR_PRINT_LEXICON_FILE_NOT_EXISTS; - - if (printinverted && !Files.exists(ApplicationSetup.INVERTED_FILENAME)) - return ERROR_PRINT_INVERTED_FILE_NOT_EXISTS; - if (printdirect && !Files.exists(ApplicationSetup.DIRECT_FILENAME)) return ERROR_PRINT_DIRECT_FILE_NOT_EXISTS; @@ -425,33 +419,31 @@ i.close(); } else if (printlexicon) { Index i = Index.createIndex(); - LexiconInputStream lex = (LexiconInputStream)(i.getIndexStructureInputStream("lexicon")); - lex.print(); - lex.close(); - i.close(); + LexiconUtil.printLexicon(i, "lexicon"); } else if (printdirect) { Index i = Index.createIndex(); - DirectIndexInputStream dirIndex = (DirectIndexInputStream)(i.getIndexStructureInputStream("direct")); - dirIndex.print(); - dirIndex.close(); - i.close(); + if (! 
i.hasIndexStructureInputStream("direct")) + { + logger.warn("Sorry, no direct index structure in index"); + } + else + { + DirectIndexInputStream dirIndex = (DirectIndexInputStream)(i.getIndexStructureInputStream("direct")); + dirIndex.print(); + dirIndex.close(); + i.close(); + } } else if (printinverted) { Index i = Index.createIndex(); - if (i.hasIndexStructureInputStream("inverted"))//some dont yet have appropriate input stream implementations + if (i.hasIndexStructureInputStream("inverted")) { InvertedIndexInputStream invIndex = (InvertedIndexInputStream)(i.getIndexStructureInputStream("inverted")); invIndex.print(); invIndex.close(); } - else if (i.hasIndexStructure("inverted")) - { - InvertedIndex invIndex = (InvertedIndex)i.getIndexStructure("inverted"); - invIndex.print(); - invIndex.close(); - } else { - logger.warn("Sorry, no inverted index structure in index"); + logger.warn("Sorry, no inverted index inputstream structure in index"); } i.close(); } else if (printstats) { @@ -527,12 +519,6 @@ case ERROR_PRINT_DOCINDEX_FILE_NOT_EXISTS : System.err.println("The specified document index file does not exist."); break; - case ERROR_PRINT_LEXICON_FILE_NOT_EXISTS : - System.err.println("The specified lexicon file ("+ApplicationSetup.LEXICON_FILENAME+") does not exist."); - break; - case ERROR_PRINT_INVERTED_FILE_NOT_EXISTS : - System.err.println("The specified inverted index does not exist."); - break; case ERROR_PRINT_DIRECT_FILE_NOT_EXISTS : System.err.println("The specified direct index does not exist."); break; @@ -571,7 +557,6 @@ protected static final int ERROR_NO_C_VALUE = 2; protected static final int ERROR_CONFLICTING_ARGUMENTS = 3; protected static final int ERROR_DIRECT_FILE_EXISTS = 4; - protected static final int ERROR_INVERTED_FILE_EXISTS = 5; protected static final int ERROR_DIRECT_FILE_NOT_EXISTS = 6; protected static final int ERROR_PRINT_DOCINDEX_FILE_NOT_EXISTS = 7; protected static final int ERROR_PRINT_LEXICON_FILE_NOT_EXISTS = 8; diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/overview.html src/overview.html --- ../../../CURRENT_HEAD/terrier/src/overview.html 2005-01-17 17:05:58.000000000 +0000 +++ src/overview.html 1970-01-01 01:00:00.000000000 +0100 @@ -1,36 +0,0 @@ - - - -Terrier Information Retrieval Platform - - - -

-Terrier is a modular platform for the rapid development of
-large-scale Information Retrieval applications, providing
-indexing and retrieval functionalities. Terrier is based on
-the Divergence from Randomness (DFR) framework. It can index
-various document collections, including the standard TREC
-collections, such as AP, WSJ, WT10G, .GOV and .GOV2. It also
-provides a wide range of parameter-free weighting approaches
-and full-text search algorithms, aiming to offer a public
-testbed for performing Information Retrieval experiments.
- - diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/package.html src/package.html --- ../../../CURRENT_HEAD/terrier/src/package.html 2009-01-28 20:16:43.000000000 +0000 +++ src/package.html 1970-01-01 01:00:00.000000000 +0100 @@ -1,29 +0,0 @@ - - - -Default package - - - -

-Provides application-level code that use the Terrier platform to
-perform indexing and retrieval from either standard test collections
- - diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/applications/TRECLMIndexing.java src/uk/ac/gla/terrier/applications/TRECLMIndexing.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/applications/TRECLMIndexing.java 2009-01-29 15:39:56.000000000 +0000 +++ src/uk/ac/gla/terrier/applications/TRECLMIndexing.java 2009-03-03 14:34:49.000000000 +0000 @@ -69,11 +69,15 @@ return; } - CreateTermEstimateIndex teIndex = new CreateTermEstimateIndex(index, modelName); - teIndex.createTermEstimateIndex(); - - CreateDocumentInitialWeightIndex docWIndex = new CreateDocumentInitialWeightIndex(index, modelName); - docWIndex.createDocumentInitialWeightIndex(); + try{ + CreateTermEstimateIndex teIndex = new CreateTermEstimateIndex(index, modelName); + teIndex.createTermEstimateIndex(); + + CreateDocumentInitialWeightIndex docWIndex = new CreateDocumentInitialWeightIndex(index, modelName); + docWIndex.createDocumentInitialWeightIndex(); + } catch (Exception e) { + logger.error("Could not make LM structures", e); + } } /** diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/compression/BitFile.java src/uk/ac/gla/terrier/compression/BitFile.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/compression/BitFile.java 2009-01-28 20:16:45.000000000 +0000 +++ src/uk/ac/gla/terrier/compression/BitFile.java 2009-03-03 14:34:49.000000000 +0000 @@ -1,638 +1,644 @@ - -/* - * Terrier - Terabyte Retriever - * Webpage: http://ir.dcs.gla.ac.uk/terrier - * Contact: terrier{a.}dcs.gla.ac.uk - * University of Glasgow - Department of Computing Science - * http://www.gla.ac.uk/ - * - * The contents of this file are subject to the Mozilla Public License - * Version 1.1 (the "License"); you may not use this file except in - * compliance with the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is BitFile.java. - * - * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. - * All Rights Reserved. - * - * Contributor(s): - * Roi Blanco (original author) - */ -package uk.ac.gla.terrier.compression; - -import java.io.File; -import java.io.IOException; - -import org.apache.log4j.Logger; -import uk.ac.gla.terrier.utility.Files; -import uk.ac.gla.terrier.utility.io.RandomDataInput; -import uk.ac.gla.terrier.utility.io.RandomDataOutput; - -/** - * This class encapsulates a random access file and provides - * the functionalities to write binary encoded, unary encoded and gamma encoded - * integers greater than zero, as well as specifying their offset in the file. It - * is employed by the DirectFile and the InvertedFile classes. - * Use the getBit/ByteOffset methods only for writting, and not for reading. - * This class contains the methods in both BitInputStream and BitOutputStream. - * The numbers are written into a byte starting from the most significant bit (i.e, left to right). - * The sequence of method calls to write a sequence of gamma encoded - * and unary encoded numbers is:
- * file.writeReset();
- * long startByte1 = file.getByteOffset();
- * byte startBit1 = file.getBitOffset();
- * file.writeGamma(20000);
- * file.writeUnary(2);
- * file.writeGamma(35000);
- * file.writeUnary(1);
- * file.writeGamma(3);
- * file.writeUnary(2);
- * long endByte1 = file.getByteOffset();
- * byte endBit1 = file.getBitOffset();
- * if (endBit1 == 0 && endByte1 > 0) {
- * 	endBit1 = 7;
- * 	endByte1--;
- * }
- * while for reading a sequence of numbers the sequence of calls is:
- * file.readReset((long) startByte1, (byte) startBit1, (long) endByte1, (byte) endBit1);
- * int gamma = file.readGamma();
- * int unary = file.readUnary();
- * - * @author Roi Blanco - * - */ - -public class BitFile implements BitInSeekable, BitIn, BitOut { - /** The logger used */ - protected static Logger logger = Logger.getRootLogger(); - /** Write buffer */ - protected byte[] buffer; - /** Pointer for the buffer */ - protected int bufferPointer; - /** Size of the buffer (it has to be 4 * k) */ - protected int bufferSize; - /** Default size */ - protected static final int DEFAULT_SIZE = 16 * 1024 ; - /** Default file mode access for a BitFile object. - * Currently "rw". */ - protected static final String DEFAULT_FILE_MODE = "rw"; - /** The byte offset.*/ - protected long byteOffset; - /** The current byte offset to be read */ - protected int readByteOffset; - /** The bit offset.*/ - protected int bitOffset; - /** A int to write to the stream. */ - protected int byteToWrite; - /** Indicates if we are writting or reading */ - protected boolean isWriteMode = false; - /** The underlying file */ - protected RandomDataInput file; - /** Same object as file, but cast to RandomDataOutput */ - protected RandomDataOutput writeFile = null; - /** Buffer for reads */ - protected byte[] inBuffer; - /** Number of bits read so far */ - protected int readBits = 0; - /** - * Initialises the variables, used internally - * - */ - protected void init(){ - byteOffset = 0; - bitOffset = 32; - byteToWrite = 0; - buffer = new byte[DEFAULT_SIZE]; - bufferSize = DEFAULT_SIZE; - - } - - /** - * Constructs an instance of the class for a given file and an acces method to the file - * @param _file File to read/write - * @param access String indicating the access permissions of the file - * @throws IOException if an I/O error occurs - */ - public BitFile(File _file, String access) { - try { - this.file = (access.indexOf("w") != -1) - ? Files.writeFileRandom(_file) - : Files.openFileRandom(_file); - init(); - } catch (IOException ioe) { - logger.error("Input/Output exception while creating BitFile object.", ioe); - } - - } - - /** - * Constructs an instance of the class for a given filename and an acces method to the file - * @param filename java.lang.String the name of the underlying file - * @param access String indicating the access permissions of the file - * @throws IOException if an I/O error occurs - */ - public BitFile(String filename, String access) { - try { - this.file = (access.indexOf("w") != -1) - ? Files.writeFileRandom(filename) - : Files.openFileRandom(filename); - init(); - } catch (IOException ioe) { - logger.error("Input/Output exception while creating BitFile object.", ioe); - } - } - - - /** - * Constructs an instance of the class for a given filename, "rw" permissions - * @param filename java.lang.String the name of the underlying file - * @throws IOException if an I/O error occurs - */ - public BitFile(String filename){ - this(filename, DEFAULT_FILE_MODE); - } - - public BitFile(File file) { - this(file, DEFAULT_FILE_MODE); - } - - /** do nothing constructor */ - protected BitFile() {} - - - /** - * Returns the byte offset of the stream. - * It corresponds to the position of the - * byte in which the next bit will be written. - * Use only when writting - * @return the byte offset in the stream. - */ - public long getByteOffset() { - return this.isWriteMode - ? byteOffset * 4 + ((32 - bitOffset) / 8) - : readByteOffset; - } - /** - * Returns the bit offset in the last byte. - * It corresponds to the position in which - * the next bit will be written. - * Use only when writting. - * @return the bit offset in the stream. 
- */ - public byte getBitOffset() { - //System.out.println("bitOffset="+bitOffset + " calculated="+((32 - bitOffset) % 8) ); - return this.isWriteMode - ? (byte)((32 - bitOffset) % 8) - : (byte)bitOffset; - /*: (byte)( 8-(( (32 - bitOffset) % 8)%7) )*/ - } - - /** - * Flushes the int currently being written into the buffer, and if it is necessary, - * it flush the buffer to the underlying OutputStream - * @param writeMe int to be written into the buffer - * @throws IOException if an I/O error occurs - */ - protected void writeIntBuffer(int writeMe) throws IOException{ - // at least there is one empty gap - buffer[bufferPointer++] = (byte)(writeMe >>> 24); - buffer[bufferPointer++] = (byte)(writeMe >>> 16); - buffer[bufferPointer++] = (byte)(writeMe >>> 8); - buffer[bufferPointer++] = (byte)writeMe; - byteOffset++; - if(bufferPointer == bufferSize){ - writeFile.write(buffer,0,bufferPointer); - bufferPointer = 0; - } - } - - /** - * Writes a number in the current byte we are using. - * @param b the number to write - * @param len the length of the number in bits - * @return the number of bits written - * @throws IOException if an I/O error occurs. - */ - protected int writeInCurrent( final int b, final int len ) throws IOException{ - if(len > 0){ - byteToWrite |= b << (bitOffset-=len); - if ( bitOffset == 0 ) { - writeIntBuffer(byteToWrite); - bitOffset = 32; - byteToWrite = 0; - } - } - return len; - } - - /** - * Writes an integer x using unary encoding. The encoding is a sequence of x -1 zeros and 1 one: - * 1, 01, 001, 0001, etc .. - * This method is not failsafe, it doesn't check if the argument is 0 or negative. - * @param x the number to write - * @return the number of bis written - * @throws IOException if an I/O error occurs. - */ - public int writeUnary( int x ) throws IOException{ - if(bitOffset >= x) return writeInCurrent(1, x); - final int shift = bitOffset; - x -= shift; - writeIntBuffer(byteToWrite); - bitOffset = 32; - byteToWrite = 0; - int i = x -1 >> 5; - while( i-- != 0 ) writeIntBuffer( 0 ); - writeInCurrent( 1, ( (x-1) & 31) + 1 ); - return x + shift ; - } - - /** - * Writes an integer x into the stream using gamma encoding. - * This method is not failsafe, it doesn't check if the argument is 0 or negative. - * @param x the int number to write - * @return the number of bits written - * @throws IOException if an I/O error occurs. - */ - public int writeGamma( int x ) throws IOException { - final int msb = BitUtilities.mostSignificantBit( x ) ; - final int l = writeUnary( msb + 1 ); - return l + ( writeInt( x , msb ) ); - } - - /** - * Writes an integer x into the underlying OutputStream. First, it checks if it fits into the current - * byte we are using for writting, and then it writes as many bytes as necessary - * @param x the int to write - * @param len length of the int in bits - * @return the number of bits written - * @throws IOException if an I/O error occurs. - */ - public int writeInt( int x, final int len ) throws IOException { - if ( bitOffset >= len ) return writeInCurrent( x, len ); - final int queue = ( len - bitOffset ) & 31; - writeInCurrent( x >> queue, bitOffset ); - writeInCurrent( x , queue); - return len; - } - - - /** - * Flushes the OuputStream - * (empty method) - */ - public void writeFlush(){} - - /** - * Reads from the file a specific number of bytes and after this - * call, a sequence of read calls may follow. The offsets given - * as arguments are inclusive. 
For example, if we call this method - * with arguments 0, 2, 1, 7, it will read in a buffer the contents - * of the underlying file from the third bit of the first byte to the - * last bit of the second byte. - * @param startByteOffset the starting byte to read from - * @param startBitOffset the bit offset in the starting byte - * @param endByteOffset the ending byte - * @param endBitOffset the bit offset in the ending byte. - * This bit is the last bit of this entry. - * @return Returns the BitIn object to use to read that data - */ - public BitIn readReset(long startByteOffset, byte startBitOffset, long endByteOffset, byte endBitOffset) { - try { - this.isWriteMode = false; - file.seek(startByteOffset); - inBuffer = new byte[(int)(endByteOffset - startByteOffset + 2)]; - file.readFully(inBuffer); - readByteOffset = 0; - bitOffset = startBitOffset; - } catch(IOException ioe) { - logger.error("Input/Output exception while reading from a random access file. Stack trace follows", ioe); - } - return this; - } - - /** - * Reads a gamma encoded integer from the underlying stream - * @return the number read - * @throws IOException if an I/O error occurs - */ - public int readGamma() { - int u = readUnary() - 1; - return (1 << u) + readBinary(u) ; - } - - /** - * Reads a unary encoded integer from the underlying stream - * @return the number read - * @throws IOException if an I/O error occurs - */ - public int readUnary() { - int x; - final int leftA = (inBuffer[readByteOffset] << bitOffset) & 0x00FF; - if(leftA != 0){ - x = 8 - BitUtilities.MSB_BYTES[ leftA ]; - bitOffset += x ; - readIn(); - return x; - } - x = 8 - bitOffset; - readByteOffset++; - while( (inBuffer[readByteOffset]== 0 )) { - x += 8; - readByteOffset++; - } - x += (bitOffset = 8 - BitUtilities.MSB_BYTES[ inBuffer[readByteOffset] & 0x00FF] ); - readIn(); - return x; - } - - /** - * Reads a new byte from the InputStream if we have finished with the current one. - * @throws IOException if we have reached the end of the file - */ - protected void readIn(){ - if(bitOffset == 8){ - bitOffset = 0; - readByteOffset++; - } - } - - - /** - * Aligns the stream to the next byte - * @throws IOException if an I/O error occurs - */ - public void align() { - if ( ( bitOffset & 7 ) == 0 ) return; - bitOffset = 0; - readByteOffset++; - } - - /** - * Reads a binary integer from the already read buffer. - * @param len is the number of binary bits to read - * @throws IOException if an I/O error occurs - * @return the decoded integer - */ - public int readBinary(int len) { - if(8 - bitOffset > len){ - int b = ( ((inBuffer[readByteOffset] << bitOffset) & 0x00FF)) >>> (8-len) ; - bitOffset += len; - return b; - } - - int x = inBuffer[readByteOffset] & ( ~ (0xFF << (8-bitOffset) )) &0xFF; - len += bitOffset - 8; - int i = len >> 3; - while(i-- != 0){ - readByteOffset++; - x = x << 8 | (inBuffer[readByteOffset] & 0xFF); - } - readByteOffset++; - bitOffset = len & 7; - return (x << bitOffset) | ((inBuffer[readByteOffset] & 0xFF) >>> (8-bitOffset)) ; - } - - /** Skip a number of bits in the current input stream - * @param len The number of bits to skip - */ - public void skipBits(int len) - { - if(8 - bitOffset > len){ - bitOffset += len; - return; - } - len += bitOffset - 8; - final int i = len >> 3; - if (i > 0) - { - readByteOffset+= i; - } - readByteOffset++; - bitOffset = len & 7; - } - - /** - * Closes the file. If the file has been written, it is also flushed to disk. - * @throws IOException if an I/O error occurs. 
- */ - - public void close(){ - try{ - if(isWriteMode){ - writeIntBufferToBit(byteToWrite,bitOffset); - writeFile.write(buffer,0,bufferPointer); - } - file.close(); - }catch(IOException ioe){ - logger.error("Input/Output exception while closing BitFile object.", ioe); - - } - } - - - /** - * Writes the current integer used into the buffer, taking into account the number of bits written. - * Used when closing the file, to avoid unecessary byte writes. - * in that integer so far. - * @param writeMe int to write - * @param bitOffset number of bits written so far in the int - */ - protected void writeIntBufferToBit(int writeMe, int bitOffset){ - if(bitOffset < 32 ) buffer[bufferPointer++] = (byte)(writeMe >>> 24); - if(bitOffset < 24 ) buffer[bufferPointer++] = (byte)(writeMe >>> 16); - if(bitOffset < 16 ) buffer[bufferPointer++] = (byte)(writeMe >>> 8); - if(bitOffset < 8 ) buffer[bufferPointer++] = (byte)(writeMe); - byteOffset++; - } - - /** - * Set the write mode to true - * - */ - public void writeReset() throws IOException { - if (!( file instanceof RandomDataOutput)) - throw new IOException("Cannot write to read only BitFile file"); - writeFile = (RandomDataOutput)file; - this.isWriteMode = true; - } - - /** - * Writes an integer in binary format to the stream. - * @param len size in bits of the number. - * @param x the integer to write. - * @return the number of bits written. - * @throws IOException if an I/O error occurs. - */ - public int writeBinary(int len, int x) throws IOException{ - return writeInt(x,len); - } - - /** - * Writes an integer x using minimal binary encoding, given an upper bound. - * This method is not failsafe, it doesn't check if the argument is 0 or negative. - * @param x the number to write - * @param b and strict bound for x - * @return the number of bits written - * @throws IOException if an I/O error occurs. - */ - public int writeMinimalBinary( final int x, final int b ) throws IOException { - - final int log2b = BitUtilities.mostSignificantBit(b); - // Numbers smaller than m are encoded in log2b bits. - final int m = ( 1 << log2b + 1 ) - b; - - if ( x < m ) - return writeInt( x, log2b ); - else - return writeInt( m + x, log2b + 1 ); - } - - /** - * Reads a binary encoded integer, given an upper bound - * @param b the upper bound - * @return the int read - * @throws IOException if an I/O error occurs - */ - public int readMinimalBinary( final int b ) throws IOException { - final int log2b = BitUtilities.mostSignificantBit(b); - final int m = ( 1 << log2b + 1 ) - b; - final int x = readBinary( log2b ); - if ( x < m ) return x + 1; - else { int temp = ( x << 1 ) + readBinary(1) ; - return temp; - } - } - - /** - * Writes and integer x into the stream using golomb coding. - * This method is not failsafe, it doesn't check if the argument or the modulus is 0 or negative. 
- * @param x the number to write - * @param b the parameter for golomb coding - * @return the number of bits written - * @throws IOException if and I/O error occurs - */ - public int writeGolomb( final int x, final int b ) throws IOException { - final int q = (x - 1) / b; - final int l = writeUnary( q + 1 ); - return l + writeMinimalBinary( x - q*b - 1, b ); - } - - /** - * Reads a Golomb encoded integer - * @param b the golomb modulus - * @return the int read - * @throws IOException if and I/O error occurs - */ - public int readGolomb( final int b) throws IOException { - final int q = (readUnary() - 1 ) * b; - return q + readMinimalBinary( b ) + 1; - } - - /** - * Writes and integer x into the stream using skewed-golomb coding. - * Consider a bucket-vector v = <b, 2b, 4b, ... , 2^i b, ...> - * an integer x is coded as unary(k+1) where k is the index - * sum(i=0)(k) v_i < x <= sum(i=0)(k+1)
, so k = log(x/b + 1) - * sum_i = b(2^n -1) (geometric progression) - * and the remainder with log(v_k) bits in binary - * if lower = ceil(x/b) -> lower = 2^i * b -> i = log(ceil(x/b)) + 1 - * the remainder x - sum_i 2^i*b - 1 = x - b(2^n - 1) - 1 is coded with floor(log(v_k)) bits - * - * This method is not failsafe, it doesn't check if the argument or the modulus is 0 or negative. - * @param x the number to write - * @param b the parameter for golomb coding - * @return the number of bits written - * @throws IOException if and I/O error occurs - */ - public int writeSkewedGolomb( final int x, final int b ) throws IOException { - final int i = BitUtilities.mostSignificantBit( x / b + 1 ); - final int l = writeUnary( i + 1 ); - final int M = ( ( 1 << i + 1 ) - 1 ) * b; - final int m = ( M / ( 2 * b ) ) * b; - - return l + writeMinimalBinary( x - m , M - m ); - } - - /** Writes a sequence of integers using interpolative coding. The data must be sorted (increasing order). - * - * @param data the vector containing the integer sequence. - * @param offset the offset into data where the sequence starts. - * @param len the number of integers to code. - * @param lo a lower bound (must be smaller than or equal to the first integer in the sequence). - * @param hi an upper bound (must be greater than or equal to the last integer in the sequence). - * @return the number of written bits. - * @throws IOException if an I/O error occurs. - */ - public int writeInterpolativeCode( int data[], int offset, int len, int lo, int hi ) throws IOException { - final int h, m; - int l; - - if ( len == 0 ) return 0; - if ( len == 1 ) return writeMinimalBinary( data[offset] - lo , hi - lo ); - h = len / 2; - m = data[ offset + h ]; - l = writeMinimalBinary( m - ( lo + h) , hi - len + h + 1 - ( lo + h ) ); - l += writeInterpolativeCode( data, offset, h, lo, m - 1 ); - return l + writeInterpolativeCode( data, offset + h + 1, len - h - 1, m + 1, hi ); - } - - /** - * Reads a skewed-golomb encoded integer from the underlying stream - * Consider a bucket-vector v = <0, 2b, 4b, ... , 2^i b, ...> - * The sum of the elements in the vector goes - * b, 3b, 7b, 2^(i-1)*b - * - * @return the number read - * @throws IOException if an I/O error occurs - */ - public int readSkewedGolomb( final int b ) throws IOException { - - final int M = ( ( 1 << readUnary() ) - 1 ) * b; - final int m = ( M / ( 2 * b ) ) * b; - return m + readMinimalBinary( M - m ) ; - } - - /** - * Reads a sequence of numbers from the stream interpolative coded. - * @param data the result vector - * @param offset offset where to write in the vector - * @param len the number of integers to decode. - * @param lo a lower bound (the same one passed to writeInterpolativeCoding) - * @param hi an upper bound (the same one passed to writeInterpolativeCoding) - * @throws IOException if an I/O error occurs - */ - public void readInterpolativeCoding( int data[], int offset, int len, int lo, int hi ) throws IOException { - final int h, m; - - if ( len == 0 ) return; - if ( len == 1 ) { - data[ offset ] = readMinimalBinaryZero( hi - lo ) + lo ; - return; - } - - h = len / 2; - m = readMinimalBinaryZero( hi - len + h - ( lo + h ) + 1 ) + lo + h ; - data[ offset + h ] = m ; - - readInterpolativeCoding( data, offset, h, lo, m - 1 ); - readInterpolativeCoding( data, offset + h + 1, len - h - 1, m + 1, hi ); - } - - /** - * Reads a minimal binary encoded number, when the upper bound can b zero. 
- * Used to interpolative code - * @param b the upper bound - * @return the int read - * @throws IOException if an I/O error occurs - */ - public int readMinimalBinaryZero(int b) throws IOException{ - if(b > 0 ) return readMinimalBinary(b); - else return 0; - } -} + +/* + * Terrier - Terabyte Retriever + * Webpage: http://ir.dcs.gla.ac.uk/terrier + * Contact: terrier{a.}dcs.gla.ac.uk + * University of Glasgow - Department of Computing Science + * http://www.gla.ac.uk/ + * + * The contents of this file are subject to the Mozilla Public License + * Version 1.1 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See + * the License for the specific language governing rights and limitations + * under the License. + * + * The Original Code is BitFile.java. + * + * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. + * All Rights Reserved. + * + * Contributor(s): + * Roi Blanco (original author) + */ +package uk.ac.gla.terrier.compression; + +import java.io.File; +import java.io.IOException; + +import org.apache.log4j.Logger; +import uk.ac.gla.terrier.utility.Files; +import uk.ac.gla.terrier.utility.io.RandomDataInput; +import uk.ac.gla.terrier.utility.io.RandomDataOutput; + +/** + * This class encapsulates a random access file and provides + * the functionalities to write binary encoded, unary encoded and gamma encoded + * integers greater than zero, as well as specifying their offset in the file. It + * is employed by the DirectFile and the InvertedFile classes. + * Use the getBit/ByteOffset methods only for writting, and not for reading. + * This class contains the methods in both BitInputStream and BitOutputStream. + * The numbers are written into a byte starting from the most significant bit (i.e, left to right). + * The sequence of method calls to write a sequence of gamma encoded + * and unary encoded numbers is:
+ * file.writeReset();
+ * long startByte1 = file.getByteOffset();
+ * byte startBit1 = file.getBitOffset();
+ * file.writeGamma(20000);
+ * file.writeUnary(2);
+ * file.writeGamma(35000);
+ * file.writeUnary(1);
+ * file.writeGamma(3);
+ * file.writeUnary(2);
+ * long endByte1 = file.getByteOffset();
+ * byte endBit1 = file.getBitOffset();
+ * if (endBit1 == 0 && endByte1 > 0) {
+ * 	endBit1 = 7;
+ * 	endByte1--;
+ * }
+ * while for reading a sequence of numbers the sequence of calls is:
+ * file.readReset((long) startByte1, (byte) startBit1, (long) endByte1, (byte) endBit1);
+ * int gamma = file.readGamma();
+ * int unary = file.readUnary();
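+ *
+ * A small worked example of the coding itself (the value 9 is purely
+ * illustrative): writeGamma(9) emits unary(1 + msb(9)) = unary(4) = 0001,
+ * followed by the msb(9) = 3 low-order bits of 9, i.e. 001, so 9 occupies
+ * the seven bits 0001001; readGamma() reverses this, first decoding the
+ * unary length and then reading the remaining binary bits.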
+ * + * @author Roi Blanco + * + */ + +public class BitFile implements BitInSeekable, BitIn, BitOut { + /** The logger used */ + protected static Logger logger = Logger.getRootLogger(); + /** Write buffer */ + protected byte[] buffer; + /** Pointer for the buffer */ + protected int bufferPointer; + /** Size of the buffer (it has to be 4 * k) */ + protected int bufferSize; + /** Default size */ + protected static final int DEFAULT_SIZE = 16 * 1024 ; + /** Default file mode access for a BitFile object. + * Currently "rw". */ + protected static final String DEFAULT_FILE_MODE = "rw"; + /** The byte offset.*/ + protected long byteOffset; + /** The current byte offset to be read */ + protected int readByteOffset; + /** The bit offset.*/ + protected int bitOffset; + /** A int to write to the stream. */ + protected int byteToWrite; + /** Indicates if we are writting or reading */ + protected boolean isWriteMode = false; + /** The underlying file */ + protected RandomDataInput file; + /** Same object as file, but cast to RandomDataOutput */ + protected RandomDataOutput writeFile = null; + /** Buffer for reads */ + protected byte[] inBuffer; + /** Number of bits read so far */ + protected int readBits = 0; + /** + * Initialises the variables, used internally + * + */ + protected void init(){ + byteOffset = 0; + bitOffset = 32; + byteToWrite = 0; + buffer = new byte[DEFAULT_SIZE]; + bufferSize = DEFAULT_SIZE; + + } + + /** + * Constructs an instance of the class for a given file and an acces method to the file + * @param _file File to read/write + * @param access String indicating the access permissions of the file + * @throws IOException if an I/O error occurs + */ + public BitFile(File _file, String access) { + try { + this.file = (access.indexOf("w") != -1) + ? Files.writeFileRandom(_file) + : Files.openFileRandom(_file); + init(); + } catch (IOException ioe) { + logger.error("Input/Output exception while creating BitFile object.", ioe); + } + + } + + /** + * Constructs an instance of the class for a given filename and an acces method to the file + * @param filename java.lang.String the name of the underlying file + * @param access String indicating the access permissions of the file + * @throws IOException if an I/O error occurs + */ + public BitFile(String filename, String access) { + try { + this.file = (access.indexOf("w") != -1) + ? Files.writeFileRandom(filename) + : Files.openFileRandom(filename); + init(); + } catch (IOException ioe) { + logger.error("Input/Output exception while creating BitFile object.", ioe); + } + } + + + /** + * Constructs an instance of the class for a given filename, "rw" permissions + * @param filename java.lang.String the name of the underlying file + * @throws IOException if an I/O error occurs + */ + public BitFile(String filename){ + this(filename, DEFAULT_FILE_MODE); + } + + public BitFile(File file) { + this(file, DEFAULT_FILE_MODE); + } + + /** do nothing constructor */ + protected BitFile() {} + + + /** + * Returns the byte offset of the stream. + * It corresponds to the position of the + * byte in which the next bit will be written. + * Use only when writting + * @return the byte offset in the stream. + */ + public long getByteOffset() { + return this.isWriteMode + ? byteOffset * 4 + ((32 - bitOffset) / 8) + : readByteOffset; + } + /** + * Returns the bit offset in the last byte. + * It corresponds to the position in which + * the next bit will be written. + * Use only when writting. + * @return the bit offset in the stream. 
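+ * In write mode, bitOffset counts the bits still unused in the current
+ * 32-bit word, so (32 - bitOffset) % 8 is the number of bits already
+ * occupied in the byte that will be written next.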
+ */ + public byte getBitOffset() { + //System.out.println("bitOffset="+bitOffset + " calculated="+((32 - bitOffset) % 8) ); + return this.isWriteMode + ? (byte)((32 - bitOffset) % 8) + : (byte)bitOffset; + /*: (byte)( 8-(( (32 - bitOffset) % 8)%7) )*/ + } + + /** + * Flushes the int currently being written into the buffer, and if it is necessary, + * it flush the buffer to the underlying OutputStream + * @param writeMe int to be written into the buffer + * @throws IOException if an I/O error occurs + */ + protected void writeIntBuffer(int writeMe) throws IOException{ + // at least there is one empty gap + buffer[bufferPointer++] = (byte)(writeMe >>> 24); + buffer[bufferPointer++] = (byte)(writeMe >>> 16); + buffer[bufferPointer++] = (byte)(writeMe >>> 8); + buffer[bufferPointer++] = (byte)writeMe; + byteOffset++; + if(bufferPointer == bufferSize){ + writeFile.write(buffer,0,bufferPointer); + bufferPointer = 0; + } + } + + /** + * Writes a number in the current byte we are using. + * @param b the number to write + * @param len the length of the number in bits + * @return the number of bits written + * @throws IOException if an I/O error occurs. + */ + protected int writeInCurrent( final int b, final int len ) throws IOException{ + if(len > 0){ + byteToWrite |= b << (bitOffset-=len); + if ( bitOffset == 0 ) { + writeIntBuffer(byteToWrite); + bitOffset = 32; + byteToWrite = 0; + } + } + return len; + } + + /** + * Writes an integer x using unary encoding. The encoding is a sequence of x -1 zeros and 1 one: + * 1, 01, 001, 0001, etc .. + * This method is not failsafe, it doesn't check if the argument is 0 or negative. + * @param x the number to write + * @return the number of bis written + * @throws IOException if an I/O error occurs. + */ + public int writeUnary( int x ) throws IOException{ + if(bitOffset >= x) return writeInCurrent(1, x); + final int shift = bitOffset; + x -= shift; + writeIntBuffer(byteToWrite); + bitOffset = 32; + byteToWrite = 0; + int i = x -1 >> 5; + while( i-- != 0 ) writeIntBuffer( 0 ); + writeInCurrent( 1, ( (x-1) & 31) + 1 ); + return x + shift ; + } + + /** + * Writes an integer x into the stream using gamma encoding. + * This method is not failsafe, it doesn't check if the argument is 0 or negative. + * @param x the int number to write + * @return the number of bits written + * @throws IOException if an I/O error occurs. + */ + public int writeGamma( int x ) throws IOException { + final int msb = BitUtilities.mostSignificantBit( x ) ; + final int l = writeUnary( msb + 1 ); + return l + ( writeInt( x , msb ) ); + } + + /** + * Writes an integer x into the underlying OutputStream. First, it checks if it fits into the current + * byte we are using for writting, and then it writes as many bytes as necessary + * @param x the int to write + * @param len length of the int in bits + * @return the number of bits written + * @throws IOException if an I/O error occurs. + */ + public int writeInt( int x, final int len ) throws IOException { + if ( bitOffset >= len ) return writeInCurrent( x, len ); + final int queue = ( len - bitOffset ) & 31; + writeInCurrent( x >> queue, bitOffset ); + writeInCurrent( x , queue); + return len; + } + + + /** + * Flushes the OuputStream + * (empty method) + */ + public void writeFlush(){} + + /** + * Reads from the file a specific number of bytes and after this + * call, a sequence of read calls may follow. The offsets given + * as arguments are inclusive. 
For example, if we call this method + * with arguments 0, 2, 1, 7, it will read in a buffer the contents + * of the underlying file from the third bit of the first byte to the + * last bit of the second byte. + * @param startByteOffset the starting byte to read from + * @param startBitOffset the bit offset in the starting byte + * @param endByteOffset the ending byte + * @param endBitOffset the bit offset in the ending byte. + * This bit is the last bit of this entry. + * @return Returns the BitIn object to use to read that data + */ + public BitIn readReset(long startByteOffset, byte startBitOffset, long endByteOffset, byte endBitOffset) { + try { + this.isWriteMode = false; + file.seek(startByteOffset); + inBuffer = new byte[(int)(endByteOffset - startByteOffset + 2)]; + file.readFully(inBuffer); + readByteOffset = 0; + bitOffset = startBitOffset; + } catch(IOException ioe) { + logger.error("Input/Output exception while reading from a random access file. Stack trace follows", ioe); + } + return this; + } + + + public BitIn readReset(long startByteOffset, byte startBitOffset) throws IOException { + throw new IOException("Unsupported"); + } + + /** + * Reads a gamma encoded integer from the underlying stream + * @return the number read + * @throws IOException if an I/O error occurs + */ + public int readGamma() { + int u = readUnary() - 1; + return (1 << u) + readBinary(u) ; + } + + /** + * Reads a unary encoded integer from the underlying stream + * @return the number read + * @throws IOException if an I/O error occurs + */ + public int readUnary() { + int x; + final int leftA = (inBuffer[readByteOffset] << bitOffset) & 0x00FF; + if(leftA != 0){ + x = 8 - BitUtilities.MSB_BYTES[ leftA ]; + bitOffset += x ; + readIn(); + return x; + } + x = 8 - bitOffset; + readByteOffset++; + while( (inBuffer[readByteOffset]== 0 )) { + x += 8; + readByteOffset++; + } + x += (bitOffset = 8 - BitUtilities.MSB_BYTES[ inBuffer[readByteOffset] & 0x00FF] ); + readIn(); + return x; + } + + /** + * Reads a new byte from the InputStream if we have finished with the current one. + * @throws IOException if we have reached the end of the file + */ + protected void readIn(){ + if(bitOffset == 8){ + bitOffset = 0; + readByteOffset++; + } + } + + + /** + * Aligns the stream to the next byte + * @throws IOException if an I/O error occurs + */ + public void align() { + if ( ( bitOffset & 7 ) == 0 ) return; + bitOffset = 0; + readByteOffset++; + } + + /** + * Reads a binary integer from the already read buffer. + * @param len is the number of binary bits to read + * @throws IOException if an I/O error occurs + * @return the decoded integer + */ + public int readBinary(int len) { + if(8 - bitOffset > len){ + int b = ( ((inBuffer[readByteOffset] << bitOffset) & 0x00FF)) >>> (8-len) ; + bitOffset += len; + return b; + } + + int x = inBuffer[readByteOffset] & ( ~ (0xFF << (8-bitOffset) )) &0xFF; + len += bitOffset - 8; + int i = len >> 3; + while(i-- != 0){ + readByteOffset++; + x = x << 8 | (inBuffer[readByteOffset] & 0xFF); + } + readByteOffset++; + bitOffset = len & 7; + return (x << bitOffset) | ((inBuffer[readByteOffset] & 0xFF) >>> (8-bitOffset)) ; + } + + /** Skip a number of bits in the current input stream + * @param len The number of bits to skip + */ + public void skipBits(int len) + { + if(8 - bitOffset > len){ + bitOffset += len; + return; + } + len += bitOffset - 8; + final int i = len >> 3; + if (i > 0) + { + readByteOffset+= i; + } + readByteOffset++; + bitOffset = len & 7; + } + + /** + * Closes the file. 
If the file has been written, it is also flushed to disk. + * @throws IOException if an I/O error occurs. + */ + + public void close(){ + try{ + if(isWriteMode){ + writeIntBufferToBit(byteToWrite,bitOffset); + writeFile.write(buffer,0,bufferPointer); + } + file.close(); + }catch(IOException ioe){ + logger.error("Input/Output exception while closing BitFile object.", ioe); + + } + } + + + /** + * Writes the current integer used into the buffer, taking into account the number of bits written. + * Used when closing the file, to avoid unecessary byte writes. + * in that integer so far. + * @param writeMe int to write + * @param bitOffset number of bits written so far in the int + */ + protected void writeIntBufferToBit(int writeMe, int bitOffset){ + if(bitOffset < 32 ) buffer[bufferPointer++] = (byte)(writeMe >>> 24); + if(bitOffset < 24 ) buffer[bufferPointer++] = (byte)(writeMe >>> 16); + if(bitOffset < 16 ) buffer[bufferPointer++] = (byte)(writeMe >>> 8); + if(bitOffset < 8 ) buffer[bufferPointer++] = (byte)(writeMe); + byteOffset++; + } + + /** + * Set the write mode to true + * + */ + public void writeReset() throws IOException { + if (!( file instanceof RandomDataOutput)) + throw new IOException("Cannot write to read only BitFile file"); + writeFile = (RandomDataOutput)file; + this.isWriteMode = true; + } + + /** + * Writes an integer in binary format to the stream. + * @param len size in bits of the number. + * @param x the integer to write. + * @return the number of bits written. + * @throws IOException if an I/O error occurs. + */ + public int writeBinary(int len, int x) throws IOException{ + return writeInt(x,len); + } + + /** + * Writes an integer x using minimal binary encoding, given an upper bound. + * This method is not failsafe, it doesn't check if the argument is 0 or negative. + * @param x the number to write + * @param b and strict bound for x + * @return the number of bits written + * @throws IOException if an I/O error occurs. + */ + public int writeMinimalBinary( final int x, final int b ) throws IOException { + + final int log2b = BitUtilities.mostSignificantBit(b); + // Numbers smaller than m are encoded in log2b bits. + final int m = ( 1 << log2b + 1 ) - b; + + if ( x < m ) + return writeInt( x, log2b ); + else + return writeInt( m + x, log2b + 1 ); + } + + /** + * Reads a binary encoded integer, given an upper bound + * @param b the upper bound + * @return the int read + * @throws IOException if an I/O error occurs + */ + public int readMinimalBinary( final int b ) throws IOException { + final int log2b = BitUtilities.mostSignificantBit(b); + final int m = ( 1 << log2b + 1 ) - b; + final int x = readBinary( log2b ); + if ( x < m ) return x + 1; + else { int temp = ( x << 1 ) + readBinary(1) ; + return temp; + } + } + + /** + * Writes and integer x into the stream using golomb coding. + * This method is not failsafe, it doesn't check if the argument or the modulus is 0 or negative. 
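+ *
+ * A small worked example (the values 7 and 3 are purely illustrative):
+ * writeGolomb(7, 3) computes q = (7 - 1) / 3 = 2, writes unary(q + 1) =
+ * 001, then writes the remainder 7 - q*3 - 1 = 0 in minimal binary with
+ * bound 3 as the single bit 0; readGolomb(3) decodes the unary part to
+ * q = (3 - 1) * 3 = 6 and adds readMinimalBinary(3) = 1, recovering 7.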
+ * @param x the number to write + * @param b the parameter for golomb coding + * @return the number of bits written + * @throws IOException if and I/O error occurs + */ + public int writeGolomb( final int x, final int b ) throws IOException { + final int q = (x - 1) / b; + final int l = writeUnary( q + 1 ); + return l + writeMinimalBinary( x - q*b - 1, b ); + } + + /** + * Reads a Golomb encoded integer + * @param b the golomb modulus + * @return the int read + * @throws IOException if and I/O error occurs + */ + public int readGolomb( final int b) throws IOException { + final int q = (readUnary() - 1 ) * b; + return q + readMinimalBinary( b ) + 1; + } + + /** + * Writes and integer x into the stream using skewed-golomb coding. + * Consider a bucket-vector v = <b, 2b, 4b, ... , 2^i b, ...> + * an integer x is coded as unary(k+1) where k is the index + * sum(i=0)(k) v_i < x <= sum(i=0)(k+1)
, so k = log(x/b + 1) + * sum_i = b(2^n -1) (geometric progression) + * and the remainder with log(v_k) bits in binary + * if lower = ceil(x/b) -> lower = 2^i * b -> i = log(ceil(x/b)) + 1 + * the remainder x - sum_i 2^i*b - 1 = x - b(2^n - 1) - 1 is coded with floor(log(v_k)) bits + * + * This method is not failsafe, it doesn't check if the argument or the modulus is 0 or negative. + * @param x the number to write + * @param b the parameter for golomb coding + * @return the number of bits written + * @throws IOException if and I/O error occurs + */ + public int writeSkewedGolomb( final int x, final int b ) throws IOException { + final int i = BitUtilities.mostSignificantBit( x / b + 1 ); + final int l = writeUnary( i + 1 ); + final int M = ( ( 1 << i + 1 ) - 1 ) * b; + final int m = ( M / ( 2 * b ) ) * b; + + return l + writeMinimalBinary( x - m , M - m ); + } + + /** Writes a sequence of integers using interpolative coding. The data must be sorted (increasing order). + * + * @param data the vector containing the integer sequence. + * @param offset the offset into data where the sequence starts. + * @param len the number of integers to code. + * @param lo a lower bound (must be smaller than or equal to the first integer in the sequence). + * @param hi an upper bound (must be greater than or equal to the last integer in the sequence). + * @return the number of written bits. + * @throws IOException if an I/O error occurs. + */ + public int writeInterpolativeCode( int data[], int offset, int len, int lo, int hi ) throws IOException { + final int h, m; + int l; + + if ( len == 0 ) return 0; + if ( len == 1 ) return writeMinimalBinary( data[offset] - lo , hi - lo ); + h = len / 2; + m = data[ offset + h ]; + l = writeMinimalBinary( m - ( lo + h) , hi - len + h + 1 - ( lo + h ) ); + l += writeInterpolativeCode( data, offset, h, lo, m - 1 ); + return l + writeInterpolativeCode( data, offset + h + 1, len - h - 1, m + 1, hi ); + } + + /** + * Reads a skewed-golomb encoded integer from the underlying stream + * Consider a bucket-vector v = <0, 2b, 4b, ... , 2^i b, ...> + * The sum of the elements in the vector goes + * b, 3b, 7b, 2^(i-1)*b + * + * @return the number read + * @throws IOException if an I/O error occurs + */ + public int readSkewedGolomb( final int b ) throws IOException { + + final int M = ( ( 1 << readUnary() ) - 1 ) * b; + final int m = ( M / ( 2 * b ) ) * b; + return m + readMinimalBinary( M - m ) ; + } + + /** + * Reads a sequence of numbers from the stream interpolative coded. + * @param data the result vector + * @param offset offset where to write in the vector + * @param len the number of integers to decode. + * @param lo a lower bound (the same one passed to writeInterpolativeCoding) + * @param hi an upper bound (the same one passed to writeInterpolativeCoding) + * @throws IOException if an I/O error occurs + */ + public void readInterpolativeCoding( int data[], int offset, int len, int lo, int hi ) throws IOException { + final int h, m; + + if ( len == 0 ) return; + if ( len == 1 ) { + data[ offset ] = readMinimalBinaryZero( hi - lo ) + lo ; + return; + } + + h = len / 2; + m = readMinimalBinaryZero( hi - len + h - ( lo + h ) + 1 ) + lo + h ; + data[ offset + h ] = m ; + + readInterpolativeCoding( data, offset, h, lo, m - 1 ); + readInterpolativeCoding( data, offset + h + 1, len - h - 1, m + 1, hi ); + } + + /** + * Reads a minimal binary encoded number, when the upper bound can b zero. 
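+ * A zero bound arises during interpolative coding when hi == lo, i.e.
+ * the value is fully determined, in which case no bits are consumed.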
+ * Used to interpolative code + * @param b the upper bound + * @return the int read + * @throws IOException if an I/O error occurs + */ + public int readMinimalBinaryZero(int b) throws IOException{ + if(b > 0 ) return readMinimalBinary(b); + else return 0; + } + +} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/compression/BitFileBuffered.java src/uk/ac/gla/terrier/compression/BitFileBuffered.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/compression/BitFileBuffered.java 1970-01-01 01:00:00.000000000 +0100 +++ src/uk/ac/gla/terrier/compression/BitFileBuffered.java 2009-03-03 14:34:49.000000000 +0000 @@ -0,0 +1,341 @@ +package uk.ac.gla.terrier.compression; + +import java.io.EOFException; +import java.io.File; +import java.io.IOException; + +import org.apache.log4j.Logger; + +import uk.ac.gla.terrier.utility.Files; +import uk.ac.gla.terrier.utility.io.RandomDataInput; + + +/** Implementation of BitInSeekable/BitIn interfaces similar to BitFile. However this + * class buffers only a small area of the posting list, to minimise large memory + * allocations during retrieval. In contrast to BitFile, this class is read-only. + * @author Patrice Lacour + * @version $Revision: $ + */ +public class BitFileBuffered implements BitInSeekable { + /** how much of a file to buffer by default */ + protected static int DEFAULT_BUFFER_LENGTH = 8*1024; + /** The logger used */ + protected static Logger logger = Logger.getRootLogger(); + /** The underlying file */ + protected RandomDataInput file; + /** how much of this file we will buffer */ + protected final int buffer_size; + protected long fileSize; + + /** + * Constructs an instance of the class for a given filename, using the default buffer size + * @param file the underlying file + * @throws IOException if an I/O error occurs + */ + public BitFileBuffered(File file) { + this(file, DEFAULT_BUFFER_LENGTH); + } + + /** + * Constructs an instance of the class for a given filename. Default buffer size + * @param filename java.lang.String the name of the underlying file + * @throws IOException if an I/O error occurs + */ + public BitFileBuffered(String filename) { + this(filename, DEFAULT_BUFFER_LENGTH); + } + + /** + * Constructs an instance of the class for a given filename + * @param file the underlying file + * @param bufSize how much of the file to buffer + * @throws IOException if an I/O error occurs + */ + public BitFileBuffered(File file, int bufSize) { + buffer_size = bufSize; + try { + this.file = Files.openFileRandom(file); + } catch (IOException ioe) { + logger.error("Input/Output exception while creating BitFileBuffered object.", ioe); + } + } + + /** + * Constructs an instance of the class for a given filename + * @param filename java.lang.String the name of the underlying file + * @param bufSize how much of the file to buffer + * @throws IOException if an I/O error occurs + */ + public BitFileBuffered(String filename, int bufSize) { + buffer_size = bufSize; + try { + fileSize = Files.length(filename); + file = Files.openFileRandom(filename); + } catch (IOException ioe) { + logger.error("Input/Output exception while creating BitFileBuffered object.", ioe); + } + } + + + /** + * Reads from the file a specific number of bytes and after this + * call, a sequence of read calls may follow. The offsets given + * as arguments are inclusive. 
For example, if we call this method + * with arguments 0, 2, 1, 7, it will read in a buffer the contents + * of the underlying file from the third bit of the first byte to the + * last bit of the second byte. + * @param startByteOffset the starting byte to read from + * @param startBitOffset the bit offset in the starting byte + * @param endByteOffset the ending byte + * @param endBitOffset the bit offset in the ending byte. + * This bit is the last bit of this entry. + * @return Returns the BitIn object to use to read that data + */ + public BitIn readReset(long startByteOffset, byte startBitOffset, long endByteOffset, byte endBitOffset) { + final long range = endByteOffset - startByteOffset + (long)1; + return new BitInBuffered(file,startByteOffset,startBitOffset, range < buffer_size ? (int)range : buffer_size); + } + + + /** + * Reads from the file from a specific offset. After this + * call, a sequence of read calls may follow. + * @param startByteOffset the starting byte to read from + * @param startBitOffset the bit offset in the starting byte + */ + public BitIn readReset(long startByteOffset, byte startBitOffset) + { + final long actualBufferSize = (startByteOffset + buffer_size) > fileSize + ? (fileSize - startByteOffset) + : buffer_size; + return new BitInBuffered(file,startByteOffset,startBitOffset, (int)actualBufferSize); + } + + public void close() + { + try { + file.close(); + } catch(IOException ioe) { + logger.error("Input/Output exception while reading from a random access file. Stack trace follows", ioe); + } + + } + + + class BitInBuffered implements BitIn + { + private RandomDataInput parentFile; + private long offset; + private byte[] inBuffer; + private int size; + private int readByteOffset; + private int bitOffset; + + public BitInBuffered(RandomDataInput file, long startByteOffset, byte bitOffset, int bufLength) + { + try{ + this.offset = startByteOffset; + this.bitOffset= bitOffset; + this.parentFile = file; + this.size = bufLength; + parentFile.seek(startByteOffset); + inBuffer = new byte[size]; + parentFile.readFully(inBuffer); + readByteOffset = 0; + }catch(IOException ioe){ + logger.error("Input/Output exception while reading from a random access file. Stack trace follows", ioe); + } + } + + + /* algorithm in this class: + for every byte read + if we exceed current buffer + seek parentFile if needed + read (size) more from parentFile + end if + for a skip + if skip exceed current buffer + seek parent file to end of skip + read (size) more from parentFile + end if + */ + + + + private void incrByte() + { + try{ + readByteOffset++; + offset++; + if(readByteOffset == size) + { + + readByteOffset=0; + inBuffer = new byte[size]; + parentFile.seek(offset); + //logger.info("Reading 1024 bytes. pos="+parentFile.getFilePointer()); + try{ + parentFile.readFully(inBuffer); + } catch (EOFException eofe) { /* ignore this */} + } + }catch(IOException ioe){ + logger.error("Input/Output exception while reading from a random access file. Stack trace follows", ioe); + } + } + + + + private void incrByte(int i) + { + try{ + //System.out.println("skypping"); + offset += i; + readByteOffset+=i; + if( readByteOffset >= size ) // we go to the next block -- we skip only the begin of the block + { + parentFile.seek(offset); // we skip the first bytes of the next block + inBuffer = new byte[size]; + readByteOffset = 0; + //logger.info("Reading 1024 bytes. 
pos="+parentFile.getFilePointer()); + try{ + parentFile.readFully(inBuffer); + } catch (EOFException eofe) { /* ignore this */} + } + + }catch(IOException ioe){ + logger.error("Input/Output exception while reading from a random access file. Stack trace follows", ioe); + } + } + + /* + * Reads a gamma encoded integer from the underlying stream + * @return the number read + * @throws IOException if an I/O error occurs + */ + public int readGamma() { + int u = readUnary() - 1; + return (1 << u) + readBinary(u) ; + } + + + /** + * Reads a unary encoded integer from the underlying stream + * @return the number read + * @throws IOException if an I/O error occurs + */ + public int readUnary() { + int x; + final int leftA = (inBuffer[readByteOffset] << bitOffset) & 0x00FF; + if(leftA != 0){ + x = 8 - BitUtilities.MSB_BYTES[ leftA ]; + bitOffset += x ; + readIn(); + return x; + } + x = 8 - bitOffset; + incrByte(); + while( (inBuffer[readByteOffset]== 0 )) { + x += 8; + incrByte(); + } + x += (bitOffset = 8 - BitUtilities.MSB_BYTES[ inBuffer[readByteOffset] & 0x00FF] ); + readIn(); + return x; + } + + /** + * Reads a new byte from the InputStream if we have finished with the current one. + * @throws IOException if we have reached the end of the file + */ + protected void readIn(){ + if(bitOffset == 8){ + bitOffset = 0; + incrByte(); + } + } + + + /** + * Aligns the stream to the next byte + * @throws IOException if an I/O error occurs + */ + public void align(){ + if ( ( bitOffset & 7 ) == 0 ) return; + bitOffset = 0; + incrByte(); + } + + /** + * Reads a binary integer from the already read buffer. + * @param len is the number of binary bits to read + * @throws IOException if an I/O error occurs + * @return the decoded integer + */ + public int readBinary(int len) { + if(8 - bitOffset > len){ + int b = ( ((inBuffer[readByteOffset] << bitOffset) & 0x00FF)) >>> (8-len) ; + bitOffset += len; + return b; + } + + int x = inBuffer[readByteOffset] & ( ~ (0xFF << (8-bitOffset) )) &0xFF; + len += bitOffset - 8; + int i = len >> 3; + while(i-- != 0){ + incrByte(); + x = x << 8 | (inBuffer[readByteOffset] & 0xFF); + } + incrByte(); + bitOffset = len & 7; + return (x << bitOffset) | ((inBuffer[readByteOffset] & 0xFF) >>> (8-bitOffset)) ; + } + + /** Skip a number of bits in the current input stream + * @param len The number of bits to skip + */ + public void skipBits(int len) + { + if(8 - bitOffset > len){ + bitOffset += len; + return; + } + len += bitOffset - 8; + final int i = len >> 3; + if (i > 0) + { + incrByte(i); + } + incrByte(); + bitOffset = len & 7; + } + + public long getByteOffset(){ return offset;} + /** + * Returns the bit offset in the last byte. + * It corresponds to the position in which + * the next bit will be written. + * Use only when writting. + * @return the bit offset in the stream. + */ + public byte getBitOffset(){return (byte) bitOffset;} + + + /** + * Closes the file. If the file has been written, it is also flushed to disk. + * @throws IOException if an I/O error occurs. 
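+ * Note that close() is intentionally a no-op here: the underlying
+ * RandomDataInput is owned by the enclosing BitFileBuffered, which
+ * closes it in its own close() method.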
+ */ + + public void close(){ + /*try{ + file.close(); + }catch(IOException ioe){ + logger.error("Input/Output exception while closing BitFile object.", ioe); + }*/ + } + + + + } +} + diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/compression/BitFileInMemory.java src/uk/ac/gla/terrier/compression/BitFileInMemory.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/compression/BitFileInMemory.java 2009-01-28 20:16:45.000000000 +0000 +++ src/uk/ac/gla/terrier/compression/BitFileInMemory.java 2009-03-03 14:34:49.000000000 +0000 @@ -94,6 +94,11 @@ { return new BitInReader(startByteOffset, startBitOffset, endByteOffset, endBitOffset); } + + public BitIn readReset(long startByteOffset, byte startBitOffset) + { + return new BitInReader(startByteOffset, startBitOffset); + } /** Close this object. Does nothing. */ public void close() @@ -107,12 +112,16 @@ protected int bitOffset; protected int readByteOffset; - public BitInReader(long startByteOffset, byte startBitOffset, long endByteOffset, byte endBitOffset) + public BitInReader(long startByteOffset, byte startBitOffset) { - readByteOffset = (int)startByteOffset; bitOffset = startBitOffset; } + + public BitInReader(long startByteOffset, byte startBitOffset, long endByteOffset, byte endBitOffset) + { + this(startByteOffset, startBitOffset); + } /** * Returns the byte offset of the stream. * It corresponds to the position of the diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/compression/BitInSeekable.java src/uk/ac/gla/terrier/compression/BitInSeekable.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/compression/BitInSeekable.java 2009-01-28 20:16:45.000000000 +0000 +++ src/uk/ac/gla/terrier/compression/BitInSeekable.java 2009-03-03 14:34:49.000000000 +0000 @@ -52,4 +52,17 @@ * @return Returns the BitIn object to use to read that data */ public BitIn readReset(long startByteOffset, byte startBitOffset, long endByteOffset, byte endBitOffset) throws IOException; + + /** + * Reads from the file a specific number of bytes and after this + * call, a sequence of read calls may follow. The offsets given + * as arguments are inclusive. For example, if we call this method + * with arguments 0, 2, 1, 7, it will read in a buffer the contents + * of the underlying file from the third bit of the first byte to the + * last bit of the second byte. 
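// [Editor's note] The description above was copied from the four-offset variant and its
// "arguments 0, 2, 1, 7" example no longer applies: this overload takes no end offset,
// so the caller decides how much to decode, typically from the entry count carried by a
// pointer. A sketch, assuming `file` is any BitInSeekable and `pointer` supplies
// getBytes()/getBits()/getNumberOfEntries():
//
//   BitIn in = file.readReset(pointer.getBytes(), pointer.getBits());
//   for (int j = 0; j < pointer.getNumberOfEntries(); j++) {
//       // readGamma()/readUnary() calls as dictated by the posting format
//   }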
+ * @param startByteOffset the starting byte to read from + * @param startBitOffset the bit offset in the starting byte + * @return Returns the BitIn object to use to read that data + */ + public BitIn readReset(long startByteOffset, byte startBitOffset) throws IOException; } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/indexing/BasicIndexer.java src/uk/ac/gla/terrier/indexing/BasicIndexer.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/indexing/BasicIndexer.java 2009-01-28 20:16:46.000000000 +0000 +++ src/uk/ac/gla/terrier/indexing/BasicIndexer.java 2009-03-03 14:34:49.000000000 +0000 @@ -35,10 +35,7 @@ import uk.ac.gla.terrier.structures.indexing.DocumentPostingList; import uk.ac.gla.terrier.structures.indexing.InvertedIndexBuilder; import uk.ac.gla.terrier.structures.indexing.LexiconBuilder; -import uk.ac.gla.terrier.structures.indexing.UTFInvertedIndexBuilder; -import uk.ac.gla.terrier.structures.indexing.UTFLexiconBuilder; import uk.ac.gla.terrier.terms.TermPipeline; -import uk.ac.gla.terrier.utility.ApplicationSetup; import uk.ac.gla.terrier.utility.FieldScore; import uk.ac.gla.terrier.utility.TermCodes; /** @@ -171,15 +168,7 @@ public void createDirectIndex(Collection[] collections) { currentIndex = Index.createNewIndex(path, prefix); - if (UTFIndexing) - { - lexiconBuilder = new UTFLexiconBuilder(currentIndex); - } - else - { - lexiconBuilder = new LexiconBuilder(currentIndex); - } - + lexiconBuilder = new LexiconBuilder(currentIndex, "lexicon"); directIndexBuilder = new DirectIndexBuilder(currentIndex); docIndexBuilder = new DocumentIndexBuilder(currentIndex); @@ -372,16 +361,8 @@ //generate the inverted index - if (UTFIndexing) - { - logger.info("Started building the UTF inverted index..."); - invertedIndexBuilder = new UTFInvertedIndexBuilder(currentIndex); - } - else - { - logger.info("Started building the inverted index..."); - invertedIndexBuilder = new InvertedIndexBuilder(currentIndex); - } + logger.info("Started building the inverted index..."); + invertedIndexBuilder = new InvertedIndexBuilder(currentIndex, "inverted"); invertedIndexBuilder.createInvertedIndex(); finishedInvertedIndexBuild(); @@ -421,13 +402,6 @@ /** Hook method, called when the inverted index is finished - ie the lexicon is finished */ protected void finishedInvertedIndexBuild() { - if (Boolean.parseBoolean(ApplicationSetup.getProperty("lexicon.use.hash","true"))) { - logger.debug("Building lexicon hash"); - try{ - LexiconBuilder.createLexiconHash(currentIndex); - } catch (IOException ioe) { - logger.warn("Problem creating (optional) Lexicon Hash", ioe); - } - } + LexiconBuilder.optimise(currentIndex, "lexicon"); } } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/indexing/BasicSinglePassIndexer.java src/uk/ac/gla/terrier/indexing/BasicSinglePassIndexer.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/indexing/BasicSinglePassIndexer.java 2009-01-28 20:16:46.000000000 +0000 +++ src/uk/ac/gla/terrier/indexing/BasicSinglePassIndexer.java 2009-03-03 14:34:49.000000000 +0000 @@ -32,14 +32,12 @@ import java.util.LinkedList; import java.util.Queue; +import uk.ac.gla.terrier.structures.BasicLexiconEntry; import uk.ac.gla.terrier.structures.Index; -import uk.ac.gla.terrier.structures.LexiconInputStream; import uk.ac.gla.terrier.structures.LexiconOutputStream; -import 
uk.ac.gla.terrier.structures.UTFLexiconInputStream; -import uk.ac.gla.terrier.structures.UTFLexiconOutputStream; +import uk.ac.gla.terrier.structures.MapFileLexiconOutputStream; import uk.ac.gla.terrier.structures.indexing.DocumentIndexBuilder; import uk.ac.gla.terrier.structures.indexing.DocumentPostingList; -import uk.ac.gla.terrier.structures.indexing.LexiconBuilder; import uk.ac.gla.terrier.structures.indexing.singlepass.FieldPostingInRun; import uk.ac.gla.terrier.structures.indexing.singlepass.FieldsMemoryPostings; import uk.ac.gla.terrier.structures.indexing.singlepass.FileRunIteratorFactory; @@ -228,15 +226,19 @@ try{ mp.finish(finishMemoryPosting()); }catch(Exception e){ - e.printStackTrace(); + logger.error("Problem creating index", e); } endCollection = System.currentTimeMillis(); long partialTime = (endCollection-startCollection)/1000; logger.info("Collection #"+collectionNo+ " took "+partialTime+ " seconds to build the runs for "+numberOfDocuments+" documents\n"); logger.info("Merging "+fileNames.size()+" runs..."); startCollection = System.currentTimeMillis(); - performMultiWayMerge(); - docIndexBuilder.finishedCollections(); + try{ + performMultiWayMerge(); + docIndexBuilder.finishedCollections(); + } catch (Exception e) { + logger.error("Problem finishing index", e); + } endCollection = System.currentTimeMillis(); logger.info("Collection #"+collectionNo+" took "+((endCollection-startCollection)/1000)+" seconds to merge\n "); logger.info("Collection #"+collectionNo+" total time "+( (endCollection-startCollection)/1000+partialTime)); @@ -305,15 +307,15 @@ * in a set of previously written runs. * The file names and the number of runs are given by the private queue */ - public void performMultiWayMerge(){ + public void performMultiWayMerge() throws IOException { String[][] fileNames = getFileNames(); - LexiconOutputStream lexStream = createLexiconOutputStream(path, prefix); + LexiconOutputStream lexStream = new MapFileLexiconOutputStream(this.currentIndex, "lexicon", BasicLexiconEntry.Factory.class); try{ if (useFieldInformation) createFieldRunMerger(fileNames); else createRunMerger(fileNames); - merger.beginMerge(fileNames.length, path + ApplicationSetup.FILE_SEPARATOR + prefix + ApplicationSetup.IFSUFFIX); + merger.beginMerge(fileNames.length, path + ApplicationSetup.FILE_SEPARATOR + prefix + ".inverted.bf"); while(!merger.isDone()){ merger.mergeOne(lexStream); } @@ -330,17 +332,16 @@ currentIndex.setIndexProperty("num.Terms", ""+numberOfUniqueTerms); currentIndex.setIndexProperty("num.Pointers", ""+numberOfPointers); currentIndex.setIndexProperty("num.Tokens", ""+numberOfTokens); - createLexicon(numberOfUniqueTerms); currentIndex.addIndexStructure( - "inverted", - invertedIndexClass, - "uk.ac.gla.terrier.structures.Lexicon,java.lang.String,java.lang.String", - "lexicon,path,prefix"); + "inverted", + invertedIndexClass, + "uk.ac.gla.terrier.structures.Index,java.lang.String", + "index,structureName"); currentIndex.addIndexStructureInputStream( - "inverted", - invertedIndexInputStreamClass, - "java.lang.String,java.lang.String,uk.ac.gla.terrier.structures.LexiconInputStream", - "path,prefix,lexicon-inputstream"); + "inverted", + "uk.ac.gla.terrier.structures.InvertedIndexInputStream", + "uk.ac.gla.terrier.structures.Index,java.lang.String,java.util.Iterator", + "index,structureName,lexicon-inputstream"); currentIndex.setIndexProperty("num.inverted.fields.bits", ""+FieldScore.FIELDS_COUNT ); }catch(Exception e){ logger.error("Problem in performMultiWayMerge", e); @@ -359,36 
+360,6 @@ return files; } - /** - * Hook method that creates the right LexiconBuilder instance - * @throws IOException - */ - protected void createLexicon(int numberOfEntries) throws IOException{ - final LexiconInputStream lis = createLexiconInputStream(path, prefix); - LexiconBuilder.createLexiconIndex(lis, numberOfEntries, lis.getEntrySize(), path, prefix ); - currentIndex.addIndexStructure( - "lexicon", - UTFIndexing ? "uk.ac.gla.terrier.structures.UTFLexicon" :"uk.ac.gla.terrier.structures.Lexicon" ); - currentIndex.addIndexStructureInputStream( - "lexicon", - UTFIndexing ? "uk.ac.gla.terrier.structures.UTFLexiconInputStream" :"uk.ac.gla.terrier.structures.LexiconInputStream"); - } - - /** - * Hook method that creates the rigth LexiconOutputStream instance. - * @param name filename for the lexicon file. - */ - protected LexiconOutputStream createLexiconOutputStream(String path, String prefix){ - return UTFIndexing ? new UTFLexiconOutputStream(path, prefix) : new LexiconOutputStream(path, prefix); - } - - /** - * Hook method that creates the rigth LexiconOutputStream instance. - * @param name filename for the lexicon file. - */ - protected LexiconInputStream createLexiconInputStream(String path, String prefix){ - return UTFIndexing ? new UTFLexiconInputStream(path, prefix) : new LexiconInputStream(path, prefix); - } /** * Hook method that creates a FieldRunMerger instance diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/indexing/BlockIndexer.java src/uk/ac/gla/terrier/indexing/BlockIndexer.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/indexing/BlockIndexer.java 2009-01-28 20:16:46.000000000 +0000 +++ src/uk/ac/gla/terrier/indexing/BlockIndexer.java 2009-03-03 14:34:49.000000000 +0000 @@ -26,9 +26,11 @@ * Rodrygo Santo */ package uk.ac.gla.terrier.indexing; +import gnu.trove.THashSet; + import java.io.IOException; import java.util.Set; -import gnu.trove.THashSet; + import uk.ac.gla.terrier.structures.FilePosition; import uk.ac.gla.terrier.structures.Index; import uk.ac.gla.terrier.structures.indexing.BlockDirectIndexBuilder; @@ -38,8 +40,6 @@ import uk.ac.gla.terrier.structures.indexing.DocumentIndexBuilder; import uk.ac.gla.terrier.structures.indexing.DocumentPostingList; import uk.ac.gla.terrier.structures.indexing.LexiconBuilder; -import uk.ac.gla.terrier.structures.indexing.UTFBlockInvertedIndexBuilder; -import uk.ac.gla.terrier.structures.indexing.UTFBlockLexiconBuilder; import uk.ac.gla.terrier.terms.TermPipeline; import uk.ac.gla.terrier.utility.ApplicationSetup; import uk.ac.gla.terrier.utility.FieldScore; @@ -281,14 +281,7 @@ (Boolean.parseBoolean(ApplicationSetup.getProperty("block.delimiters.enabled", "false")) ? 
" delimited-block indexing enabled" : "")); currentIndex = Index.createNewIndex(path, prefix); - if (UTFIndexing) - { - lexiconBuilder = new UTFBlockLexiconBuilder(currentIndex); - } - else - { - lexiconBuilder = new BlockLexiconBuilder(currentIndex); - } + lexiconBuilder = new BlockLexiconBuilder(currentIndex, "lexicon"); directIndexBuilder = new BlockDirectIndexBuilder(currentIndex); docIndexBuilder = new DocumentIndexBuilder(currentIndex); //int LexiconCount = 0; @@ -412,16 +405,8 @@ return; } - if (UTFIndexing) - { - logger.info("Started building the utf block inverted index..."); - invertedIndexBuilder = new UTFBlockInvertedIndexBuilder(currentIndex); - } - else - { - logger.info("Started building the block inverted index..."); - invertedIndexBuilder = new BlockInvertedIndexBuilder(currentIndex); - } + logger.info("Started building the block inverted index..."); + invertedIndexBuilder = new BlockInvertedIndexBuilder(currentIndex, "inverted"); invertedIndexBuilder.createInvertedIndex(); this.finishedInvertedIndexBuild(); currentIndex.flush(); @@ -459,14 +444,7 @@ /** Hook method, called when the inverted index is finished - ie the lexicon is finished */ protected void finishedInvertedIndexBuild() { - if (Boolean.parseBoolean(ApplicationSetup.getProperty("lexicon.use.hash","true"))) { - logger.debug("Building lexicon hash"); - try{ - LexiconBuilder.createLexiconHash(currentIndex); - } catch (IOException ioe) { - logger.warn("Problem creating (optional) Lexicon Hash", ioe); - } - } + LexiconBuilder.optimise(currentIndex, "lexicon"); } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/indexing/CreateDocumentInitialWeightIndex.java src/uk/ac/gla/terrier/indexing/CreateDocumentInitialWeightIndex.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/indexing/CreateDocumentInitialWeightIndex.java 2009-01-28 20:16:46.000000000 +0000 +++ src/uk/ac/gla/terrier/indexing/CreateDocumentInitialWeightIndex.java 2009-03-03 14:34:49.000000000 +0000 @@ -25,7 +25,6 @@ */ package uk.ac.gla.terrier.indexing; import java.io.DataOutputStream; -import java.io.File; import java.io.IOException; import java.util.Arrays; @@ -33,17 +32,17 @@ import uk.ac.gla.terrier.matching.models.languagemodel.LanguageModel; import uk.ac.gla.terrier.structures.CollectionStatistics; -import uk.ac.gla.terrier.structures.Index; import uk.ac.gla.terrier.structures.DirectIndex; import uk.ac.gla.terrier.structures.DocumentIndex; +import uk.ac.gla.terrier.structures.Index; import uk.ac.gla.terrier.structures.InvertedIndex; import uk.ac.gla.terrier.structures.Lexicon; import uk.ac.gla.terrier.structures.indexing.DocumentInitialWeightIndex; import uk.ac.gla.terrier.structures.indexing.TermEstimateIndex; +import uk.ac.gla.terrier.utility.ApplicationSetup; import uk.ac.gla.terrier.utility.Files; import uk.ac.gla.terrier.utility.Rounding; import uk.ac.gla.terrier.utility.TerrierTimer; -import uk.ac.gla.terrier.utility.ApplicationSetup; /** * This class creates the initial weight index of all * documents in the collection. This is done for @@ -63,7 +62,7 @@ protected InvertedIndex invIndex; /** The Lexicon for retrieval. */ - protected Lexicon lexicon; + protected Lexicon lexicon; /** The DirectIndex for retrieval. */ protected DirectIndex directIndex; @@ -89,16 +88,9 @@ /** The data structure of the term esitmates. */ protected TermEstimateIndex teIndex; - /** - * The default constructor of CreateDocumentInitialWeightIndex. 
- * @param modelName The name of the applied language model. - */ - public CreateDocumentInitialWeightIndex(String modelName) - { - this(Index.createIndex(), modelName); - } - public CreateDocumentInitialWeightIndex(Index i, String modelName) { + + public CreateDocumentInitialWeightIndex(Index i, String modelName) throws IOException { long startLoading = System.currentTimeMillis(); docIndex = i.getDocumentIndex(); lexicon = i.getLexicon(); @@ -148,9 +140,9 @@ TerrierTimer timer1 = new TerrierTimer(); timer1.start(); double[] TF = new double[(int)numberOfUniqueTerms]; - for (int i = 0; i < numberOfUniqueTerms; i++){ - lexicon.findTerm(i); - TF[i] = (double)lexicon.getTF(); + for (int i = 0; i < numberOfUniqueTerms; i++) + { + TF[i] = (double)lexicon.getLexiconEntry(i).getValue().getFrequency(); } timer1.setBreakPoint(); if(logger.isDebugEnabled()) { diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/indexing/CreateTermEstimateIndex.java src/uk/ac/gla/terrier/indexing/CreateTermEstimateIndex.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/indexing/CreateTermEstimateIndex.java 2009-01-28 20:16:46.000000000 +0000 +++ src/uk/ac/gla/terrier/indexing/CreateTermEstimateIndex.java 2009-03-03 14:34:49.000000000 +0000 @@ -25,8 +25,9 @@ */ package uk.ac.gla.terrier.indexing; import java.io.DataOutputStream; -import java.io.File; import java.io.IOException; +import java.util.Iterator; +import java.util.Map; import org.apache.log4j.Logger; @@ -36,11 +37,11 @@ import uk.ac.gla.terrier.structures.Index; import uk.ac.gla.terrier.structures.InvertedIndex; import uk.ac.gla.terrier.structures.Lexicon; -import uk.ac.gla.terrier.structures.indexing.TermEstimateIndex; +import uk.ac.gla.terrier.structures.LexiconEntry; +import uk.ac.gla.terrier.utility.ApplicationSetup; import uk.ac.gla.terrier.utility.Files; import uk.ac.gla.terrier.utility.Rounding; import uk.ac.gla.terrier.utility.TerrierTimer; -import uk.ac.gla.terrier.utility.ApplicationSetup; /** * This class creates the term estimate index of all terms in vocabulary. This is * done for language modeling approach. @@ -121,9 +122,10 @@ * Create the TermEstimateIndex. It computes the average term generation probability for each term in the vocabulary of the collection. 
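// [Editor's note -- sketch grounded in the TF loop of CreateDocumentInitialWeightIndex
// above.] Lookups by term id now appear to return the term and its entry together as a
// Map.Entry, with statistics read through LexiconEntry accessors:
//
//   Map.Entry<String, LexiconEntry> e = lexicon.getLexiconEntry(termid);
//   String term = e.getKey();
//   int TF  = e.getValue().getFrequency();          // collection frequency
//   int n_t = e.getValue().getDocumentFrequency();  // document frequency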
* */ + @SuppressWarnings("unchecked") public void createTermEstimateIndex(){ TerrierTimer timer = null; - long numberOfUniqueTerms = collectionStatistics.getNumberOfUniqueTerms(); + int numberOfUniqueTerms = collectionStatistics.getNumberOfUniqueTerms(); if(logger.isInfoEnabled()){ logger.info("number of unique terms: " + numberOfUniqueTerms); logger.info("Creating TermEstimateIndex..."); @@ -133,10 +135,15 @@ timer.setTotalNumber((double)numberOfUniqueTerms); timer.start(); } - termEstimates = new double[(int)numberOfUniqueTerms]; - for (int i = 0; i < numberOfUniqueTerms; i++){ - lexicon.seekEntry(i); - int[][] pointers = invIndex.getDocuments(i); + termEstimates = new double[numberOfUniqueTerms]; + + Iterator> lexiconStream = + (Iterator>)index.getIndexStructureInputStream("lexicon"); + int i=0; + while(lexiconStream.hasNext()) + { + Map.Entry lee = lexiconStream.next(); + int[][] pointers = invIndex.getDocuments(lee.getValue()); int[] docids = pointers[0]; int[] tf = pointers[1]; double[] docLength = new double[tf.length]; @@ -148,17 +155,18 @@ if(logger.isDebugEnabled()){ if ((i+1) % 10000 == 0){ timer.setRemainingTime((i+1)); - logger.debug("term: " + lexicon.getTerm() + - ", TF: " + lexicon.getTF() +", " + + logger.debug("term: " + lee.getKey() + + ", TF: " + lee.getValue().getFrequency() +", " + Rounding.toString((double)(i+1)/numberOfUniqueTerms*100, 2) + "% finished, time remaining: " + timer.toStringMinutesSeconds()); } } + i++; } try{ DataOutputStream output = new DataOutputStream( Files.writeFileStream(INDEX_FILENAME)); - for (int i = 0; i < termEstimates.length; i++) + for (i = 0; i < termEstimates.length; i++) output.writeDouble(termEstimates[i]); output.close(); } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/indexing/Indexer.java src/uk/ac/gla/terrier/indexing/Indexer.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/indexing/Indexer.java 2009-01-28 20:16:46.000000000 +0000 +++ src/uk/ac/gla/terrier/indexing/Indexer.java 2009-03-03 14:34:49.000000000 +0000 @@ -33,17 +33,17 @@ import org.apache.log4j.Logger; import uk.ac.gla.terrier.structures.Index; +import uk.ac.gla.terrier.structures.IndexUtil; import uk.ac.gla.terrier.structures.indexing.DirectIndexBuilder; import uk.ac.gla.terrier.structures.indexing.DocumentIndexBuilder; import uk.ac.gla.terrier.structures.indexing.InvertedIndexBuilder; import uk.ac.gla.terrier.structures.indexing.LexiconBuilder; import uk.ac.gla.terrier.structures.merging.BlockStructureMerger; import uk.ac.gla.terrier.structures.merging.StructureMerger; -import uk.ac.gla.terrier.terms.TermPipeline; import uk.ac.gla.terrier.terms.SkipTermPipeline; +import uk.ac.gla.terrier.terms.TermPipeline; import uk.ac.gla.terrier.utility.ApplicationSetup; import uk.ac.gla.terrier.utility.FieldScore; -import uk.ac.gla.terrier.utility.Files; /** * Properties: *
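// [Editor's note] The pattern introduced in createTermEstimateIndex() above is the new
// canonical way to scan the whole lexicon: ask the Index for the "lexicon" input stream
// and iterate, instead of calling seekEntry(i) over term ids. A minimal sketch:
//
//   @SuppressWarnings("unchecked")
//   Iterator<Map.Entry<String, LexiconEntry>> lex = (Iterator<Map.Entry<String, LexiconEntry>>)
//       index.getIndexStructureInputStream("lexicon");
//   while (lex.hasNext()) {
//       Map.Entry<String, LexiconEntry> lee = lex.next();
//       // lee.getKey() is the term; lee.getValue() holds its statistics and
//       // its pointer into the inverted file
//   }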
    @@ -63,15 +63,6 @@ /** the logger for this class */ protected static Logger logger = Logger.getRootLogger(); - protected static String[] indexFileSuffices = new String[]{ - ApplicationSetup.PROPERTIES_SUFFIX, - ApplicationSetup.IFSUFFIX, - ApplicationSetup.DF_SUFFIX, - ApplicationSetup.LEXICON_INDEX_SUFFIX, - ApplicationSetup.LEXICONSUFFIX, - ApplicationSetup.DOC_INDEX_SUFFIX, - ApplicationSetup.LEXICON_HASH_SUFFIX}; - protected boolean UTFIndexing = false; /** @@ -322,11 +313,10 @@ } else { - final String src = path + ApplicationSetup.FILE_SEPARATOR + prefix; - final String dest = path + ApplicationSetup.FILE_SEPARATOR + oldIndexPrefix; - for (String suffix: indexFileSuffices) - { - Files.rename(src+suffix, dest+suffix); + try{ + IndexUtil.renameIndex(path, prefix, path, oldIndexPrefix); + } catch (IOException ioe ) { + logger.error("Could not rename index", ioe); } } //restore the prefix @@ -374,19 +364,14 @@ sMerger.setNumberOfBits(FieldScore.FIELDS_COUNT); sMerger.mergeStructures(); - - String separator = ApplicationSetup.FILE_SEPARATOR; src1.close(); src2.close(); dst.close(); - //delete old indices - for(String suffix : indexFileSuffices) - { - Files.delete(index1[0]+separator+index1[1]+ suffix); + //delete old indices + try{ + IndexUtil.deleteIndex(index1[0], index1[1]); + IndexUtil.deleteIndex(index2[0], index2[1]); + } catch (IOException ioe) { + logger.warn("Could not delete merge input indices ", ioe); } - - for(String suffix : indexFileSuffices) - { - Files.delete(index2[0]+separator+index2[1]+ suffix); - } } /** Merge a series of indices, in pair-wise fashion @@ -414,11 +399,10 @@ logger.info("Done merging"); //rename the generated structures - String src = mpath + ApplicationSetup.FILE_SEPARATOR + mprefix+"_"+ (counterMerged-1); - String dest = mpath + ApplicationSetup.FILE_SEPARATOR + mprefix; - for (String suffix: indexFileSuffices) - { - Files.rename(src+suffix, dest+suffix); + try{ + IndexUtil.renameIndex(mpath, mprefix+"_"+ (counterMerged-1), mpath, mprefix); + } catch (IOException ioe) { + logger.error("Could not rename merged index", ioe); } } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/indexing/hadoop/Hadoop_BasicSinglePassIndexer.java src/uk/ac/gla/terrier/indexing/hadoop/Hadoop_BasicSinglePassIndexer.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/indexing/hadoop/Hadoop_BasicSinglePassIndexer.java 2009-02-16 21:43:02.000000000 +0000 +++ src/uk/ac/gla/terrier/indexing/hadoop/Hadoop_BasicSinglePassIndexer.java 2009-03-03 14:34:49.000000000 +0000 @@ -30,9 +30,9 @@ import java.io.DataInputStream; import java.io.DataOutputStream; import java.io.IOException; +import java.util.ArrayList; import java.util.Iterator; import java.util.LinkedList; -import java.util.ArrayList; import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; @@ -50,10 +50,12 @@ import uk.ac.gla.terrier.compression.BitOutputStream; import uk.ac.gla.terrier.indexing.BasicSinglePassIndexer; import uk.ac.gla.terrier.indexing.Document; +import uk.ac.gla.terrier.structures.BasicLexiconEntry; import uk.ac.gla.terrier.structures.DocumentIndexInputStream; import uk.ac.gla.terrier.structures.FilePosition; import uk.ac.gla.terrier.structures.Index; import uk.ac.gla.terrier.structures.LexiconOutputStream; +import uk.ac.gla.terrier.structures.MapFileLexiconOutputStream; import uk.ac.gla.terrier.structures.indexing.DocumentIndexBuilder; import 
uk.ac.gla.terrier.structures.indexing.DocumentPostingList; import uk.ac.gla.terrier.structures.indexing.singlepass.FieldPostingInRun; @@ -63,8 +65,8 @@ import uk.ac.gla.terrier.structures.indexing.singlepass.hadoop.HadoopRunWriter; import uk.ac.gla.terrier.structures.indexing.singlepass.hadoop.HadoopRunsMerger; import uk.ac.gla.terrier.structures.indexing.singlepass.hadoop.MapData; -import uk.ac.gla.terrier.structures.indexing.singlepass.hadoop.MapEmittedTerm; import uk.ac.gla.terrier.structures.indexing.singlepass.hadoop.MapEmittedPostingList; +import uk.ac.gla.terrier.structures.indexing.singlepass.hadoop.MapEmittedTerm; import uk.ac.gla.terrier.structures.indexing.singlepass.hadoop.SimpleDocumentIndexBuilder; import uk.ac.gla.terrier.utility.ApplicationSetup; import uk.ac.gla.terrier.utility.FieldScore; @@ -348,7 +350,7 @@ */ /** OutputStream for the Lexicon*/ - protected LexiconOutputStream lexstream; + protected LexiconOutputStream lexstream; /** runIterator factory being used to generate RunIterators */ protected HadoopRunIteratorFactory runIteratorF = null; /** records whether the reduce() has been called for the first time */ @@ -434,11 +436,11 @@ * flushed. * @param mapData - info about the runs(maps) and the flushes */ - public void startReduce(LinkedList mapData) + public void startReduce(LinkedList mapData) throws IOException { logger.info("The number of Reduce Tasks being used : "+jc.getNumReduceTasks()); ((HadoopRunsMerger)(super.merger)).beginMerge(mapData); - lexstream = createLexiconOutputStream(currentIndex.getPath(), currentIndex.getPrefix()); + lexstream = new MapFileLexiconOutputStream(this.currentIndex, "lexicon", BasicLexiconEntry.Factory.class); // Tell the merger how many to Reducers to merge for ((HadoopRunsMerger) merger).setNumReducers(jc.getNumReduceTasks()); } @@ -521,15 +523,15 @@ //2. the end of the inverted file merger.getBos().close(); currentIndex.addIndexStructure( - "inverted", - invertedIndexClass, - "uk.ac.gla.terrier.structures.Lexicon,java.lang.String,java.lang.String", - "lexicon,path,prefix"); + "inverted", + invertedIndexClass, + "uk.ac.gla.terrier.structures.Index,java.lang.String", + "index,structureName"); currentIndex.addIndexStructureInputStream( - "inverted", - invertedIndexInputStreamClass, - "java.lang.String,java.lang.String,uk.ac.gla.terrier.structures.LexiconInputStream", - "path,prefix,lexicon-inputstream"); + "inverted", + "uk.ac.gla.terrier.structures.InvertedIndexInputStream", + "uk.ac.gla.terrier.structures.Index,java.lang.String,java.util.Iterator", + "index,structureName,lexicon-inputstream"); currentIndex.setIndexProperty("num.inverted.fields.bits", ""+FieldScore.FIELDS_COUNT ); //3. document index @@ -546,17 +548,13 @@ //4. close the map phase indices for(Index i : sourceIndices) { - String path = i.getPath(); - String prefix = i.getPrefix(); i.close(); } //5. 
finalise the lexicon - int numTerms; - currentIndex.setIndexProperty("num.Terms",""+ (numTerms = lexstream.getNumberOfTermsWritten()) ); + currentIndex.setIndexProperty("num.Terms",""+ lexstream.getNumberOfTermsWritten() ); currentIndex.setIndexProperty("num.Tokens",""+lexstream.getNumberOfTokensWritten() ); currentIndex.setIndexProperty("num.Pointers",""+lexstream.getNumberOfPointersWritten() ); lexstream.close(); - this.createLexicon(numTerms); this.finishedInvertedIndexBuild(); currentIndex.flush(); } @@ -573,7 +571,7 @@ try{ tempRM.setBos(new BitOutputStream( currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR - + currentIndex.getPrefix() + ApplicationSetup.IFSUFFIX )); + + currentIndex.getPrefix() + ".inverted.bf")); } catch (IOException ioe) { ioe.printStackTrace(); } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/indexing/hadoop/Hadoop_BlockSinglePassIndexer.java src/uk/ac/gla/terrier/indexing/hadoop/Hadoop_BlockSinglePassIndexer.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/indexing/hadoop/Hadoop_BlockSinglePassIndexer.java 2009-01-28 20:16:47.000000000 +0000 +++ src/uk/ac/gla/terrier/indexing/hadoop/Hadoop_BlockSinglePassIndexer.java 2009-03-03 14:34:49.000000000 +0000 @@ -265,7 +265,7 @@ try{ tempRM.setBos(new BitOutputStream( currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR - + currentIndex.getPrefix() + ApplicationSetup.IFSUFFIX )); + + currentIndex.getPrefix() + ".inverted.bf" )); } catch (IOException ioe) { ioe.printStackTrace(); } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/matching/LMMatching.java src/uk/ac/gla/terrier/matching/LMMatching.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/matching/LMMatching.java 2009-01-28 20:16:47.000000000 +0000 +++ src/uk/ac/gla/terrier/matching/LMMatching.java 2009-03-03 14:34:49.000000000 +0000 @@ -32,6 +32,7 @@ import uk.ac.gla.terrier.matching.models.languagemodel.LanguageModel; import uk.ac.gla.terrier.matching.tsms.TermScoreModifier; import uk.ac.gla.terrier.structures.Index; +import uk.ac.gla.terrier.structures.LexiconEntry; import uk.ac.gla.terrier.structures.indexing.DocumentInitialWeightIndex; import uk.ac.gla.terrier.structures.indexing.TermEstimateIndex; import uk.ac.gla.terrier.utility.HeapSort; @@ -165,10 +166,10 @@ final int queryLength = queryTermStrings.length; for (int i = 0; i < queryLength; i++) { //we seek the query term in the lexicon - boolean found = lexicon.findTerm(queryTermStrings[i]); - //and if it is not found, we continue with the next term - if (!found) + LexiconEntry le = lexicon.getLexiconEntry(queryTermStrings[i]); + if (le == null) continue; + //because when the TreeNode is created, the term //code assigned is taken from //the TermCodes class, the assigned term code is @@ -176,31 +177,31 @@ //process. Therefore, at this point, the term //code should be updated with the one //stored in the lexicon file. 
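// [Editor's note] This hunk shows the migration that recurs throughout the matching
// classes: the stateful lexicon.findTerm(term)/getTF()/getTermId() sequence becomes one
// thread-safe lookup that returns null for unknown terms. The new idiom, as used above:
//
//   LexiconEntry le = lexicon.getLexiconEntry(term);
//   if (le == null)
//       continue;                                    // term absent from the collection
//   wmodel.setTermFrequency((double) le.getFrequency());
//   int[][] pointers = invertedIndex.getDocuments(le);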
- queryTerms.setTermProperty(queryTermStrings[i], lexicon.getTermId()); + queryTerms.setTermProperty(queryTermStrings[i], le.getTermId()); if(logger.isDebugEnabled()){ - logger.debug("" + (i + 1) + ": " + queryTermStrings[i].trim() + "(" + lexicon.getTermId() + ")"); + logger.debug("" + (i + 1) + ": " + queryTermStrings[i].trim() + "(" + le.getTermId() + ")"); } //the weighting model is prepared for assigning scores to documents - wmodel.setTermFrequency((double)lexicon.getTF()); - this.termFrequency[i] = (double)lexicon.getTF(); - this.termEstimates[i] = this.termEstimateIndex.getTermEstimateByTermid(lexicon.getTermId()); + wmodel.setTermFrequency((double)le.getFrequency()); + this.termFrequency[i] = (double)le.getFrequency(); + this.termEstimates[i] = this.termEstimateIndex.getTermEstimateByTermid(le.getTermId()); if(logger.isDebugEnabled()){ logger.debug( " with " - + lexicon.getNt() + + le.getDocumentFrequency() + " documents (TF is " - + lexicon.getTF() + + le.getFrequency() + ")."); } //check if the IDF is very low. if(logger.isInfoEnabled()){ - if (IGNORE_LOW_IDF_TERMS==true && docIndex.getNumberOfDocuments() < lexicon.getTF()) { + if (IGNORE_LOW_IDF_TERMS==true && docIndex.getNumberOfDocuments() < le.getFrequency()) { logger.info("query term " + queryTermStrings[i] + " has low idf - ignored from scoring."); continue; } } //the postings are beign read from the inverted file. - pointers = invertedIndex.getDocuments(queryTerms.getTermCode(queryTermStrings[i])); + pointers = invertedIndex.getDocuments(le); init_tf(i, pointers); diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/matching/Matching.java src/uk/ac/gla/terrier/matching/Matching.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/matching/Matching.java 2009-01-28 20:16:47.000000000 +0000 +++ src/uk/ac/gla/terrier/matching/Matching.java 2009-03-03 14:34:49.000000000 +0000 @@ -114,7 +114,7 @@ /** The document index used.*/ protected DocumentIndex docIndex; /** The lexicon used.*/ - protected Lexicon lexicon; + protected Lexicon lexicon; /** The inverted file.*/ protected InvertedIndex invertedIndex; /** The collection statistics */ @@ -333,18 +333,19 @@ //because when the TreeNode is created, the term code assigned is taken from //the TermCodes class, the assigned term code is only valid during the indexing //process. Therefore, at this point, the term code should be updated with the one - //stored in the lexicon file. - queryTerms.setTermProperty(queryTermStrings[i], lEntry.termId); + //stored in the lexicon file. + queryTerms.setTermProperty(queryTermStrings[i], lEntry); //the weighting model is prepared for assigning scores to documents wmodel.setKeyFrequency(queryTerms.getTermWeight(queryTermStrings[i])); - wmodel.setDocumentFrequency((double)lEntry.n_t); - wmodel.setTermFrequency((double)lEntry.TF); + wmodel.setDocumentFrequency((double)lEntry.getDocumentFrequency()); + wmodel.setTermFrequency((double)lEntry.getFrequency()); - logger.debug((i + 1) + ": " + queryTermStrings[i].trim() + " with " + lEntry.n_t + " documents (TF is " + lEntry.TF + ")."); + logger.debug((i + 1) + ": " + queryTermStrings[i].trim() + " with " + lEntry.getDocumentFrequency() + + " documents (TF is " + lEntry.getFrequency() + ")."); //check if the IDF is very low. 
- if (IGNORE_LOW_IDF_TERMS && docIndex.getNumberOfDocuments() < lEntry.TF) { + if (IGNORE_LOW_IDF_TERMS && docIndex.getNumberOfDocuments() < lEntry.getFrequency()) { logger.debug("query term " + queryTermStrings[i] + " has low idf - ignored from scoring."); continue; } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/matching/MatchingQueryTerms.java src/uk/ac/gla/terrier/matching/MatchingQueryTerms.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/matching/MatchingQueryTerms.java 2009-01-28 20:16:47.000000000 +0000 +++ src/uk/ac/gla/terrier/matching/MatchingQueryTerms.java 2009-03-03 14:34:49.000000000 +0000 @@ -34,6 +34,7 @@ import uk.ac.gla.terrier.matching.dsms.DocumentScoreModifier; import uk.ac.gla.terrier.matching.tsms.TermScoreModifier; import uk.ac.gla.terrier.querying.parser.Query; +import uk.ac.gla.terrier.structures.TermStatistics; /** * Models a query used for matching documents. It is created * by creating an instance of this class, and then passing it as @@ -44,20 +45,22 @@ * @author Vassilis Plachouras, Craig Macdonald. * @version $Revision: 1.24 $ */ -public class MatchingQueryTerms implements Serializable,Cloneable{ - +public class MatchingQueryTerms implements Serializable,Cloneable +{ + private static final long serialVersionUID = -9134975387300425203L; /** The weight and the modifiers associated with a query term.*/ - protected static class QueryTermProperties implements Serializable{ - - + protected static class QueryTermProperties implements Serializable + { + private static final long serialVersionUID = 6327392687128896557L; + /** The weight of a query term. This is usually how many times the term occurred * in the query, but sometime may be altered if a weight has been specified on the * query term: eg QueryExpansion will do this, as will manually specifying a weight * on the unparsed query (example term1 term2^3). */ double weight; - /** The term code (identifier) of the query term.*/ - int termCode; + /** Info about the query term.*/ + TermStatistics stats; /** The term score modifiers associated with a particular query term.*/ ArrayList modifiers = new ArrayList(); @@ -70,8 +73,8 @@ * of a query term. * @param code int the term code of a query term. */ - public QueryTermProperties(int code) { - termCode = code; + public QueryTermProperties(TermStatistics _stats) { + stats = _stats; } /** @@ -106,9 +109,9 @@ * @param w double the weight of a query term. * @param code int the term code of a query term. */ - public QueryTermProperties(double w, int code) { + public QueryTermProperties(double w, TermStatistics _stats) { weight = w; - termCode = code; + stats = _stats; } /** @@ -117,9 +120,9 @@ * @param tsm TermScoreModifier the modifier associated with a query term. * @param code int the term code of a query term. */ - public QueryTermProperties(TermScoreModifier tsm, int code) { + public QueryTermProperties(TermScoreModifier tsm, TermStatistics _stats) { modifiers.add(tsm); - termCode = code; + stats = _stats; } /** @@ -128,15 +131,15 @@ * @param tsm TermScoreModifier the modifier associated with a query term. * @param code int the term code of a query term. 
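// [Editor's note -- sketch; `queryTerm` is an illustrative name.] Callers that
// previously stored a bare int term code in MatchingQueryTerms now store the whole
// statistics object; a LexiconEntry serves as the TermStatistics, as Matching.java
// does above:
//
//   LexiconEntry lEntry = lexicon.getLexiconEntry(queryTerm);
//   if (lEntry != null)
//       queryTerms.setTermProperty(queryTerm, lEntry);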
*/ - public QueryTermProperties(double w, TermScoreModifier tsm, int code) { + public QueryTermProperties(double w, TermScoreModifier tsm, TermStatistics _stats) { weight = w; modifiers.add(tsm); - termCode = code; + stats = _stats; } public Object clone() { - QueryTermProperties newO = new QueryTermProperties(weight, termCode); + QueryTermProperties newO = new QueryTermProperties(weight, stats); for (TermScoreModifier tsm : modifiers) newO.modifiers.add((TermScoreModifier)(tsm.clone())); return (Object)newO; @@ -144,7 +147,7 @@ public int hashCode() { - int hashCodeValue = termCode; + int hashCodeValue = stats.hashCode(); hashCodeValue += (new Double(weight)).hashCode(); for (TermScoreModifier tsm : modifiers) { @@ -275,16 +278,16 @@ } /** - * Sets the term integer identifier for the given query term. + * Sets the term statistics for the given query term. - * @param term String the term for which the term identifier is set. + * @param term String the term for which the statistics are set. - * @param code int the term identifier. + * @param stats TermStatistics the statistics of the term. */ - public void setTermProperty(String term, int code) { + public void setTermProperty(String term, TermStatistics stats) { QueryTermProperties properties = termProperties.get(term); if (properties == null) { - termProperties.put(term, new QueryTermProperties(code)); + termProperties.put(term, new QueryTermProperties(stats)); } else { - properties.termCode = code; + properties.stats = stats; } } @@ -364,11 +367,9 @@ - * @return int the term code of the given query term, or -1 if the term - * does not appear in the query. + * @return TermStatistics the statistics of the given query term, or null if the term + * does not appear in the query. */ - public int getTermCode(String term) { - QueryTermProperties tp = (QueryTermProperties)termProperties.get(term); - if (tp!=null) - return tp.termCode; - return -1; + public TermStatistics getStatistics(String term) { + QueryTermProperties tp = termProperties.get(term); + return tp == null ? null : tp.stats; } /** @@ -379,7 +380,7 @@ * of the query. 
*/ public TermScoreModifier[] getTermScoreModifiers(String term) { - QueryTermProperties tp = (QueryTermProperties)termProperties.get(term); + QueryTermProperties tp = termProperties.get(term); if (tp!=null) return (TermScoreModifier[])tp.modifiers.toArray(tmpTSM); return null; diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/matching/dsms/BlockScoreModifier.java src/uk/ac/gla/terrier/matching/dsms/BlockScoreModifier.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/matching/dsms/BlockScoreModifier.java 2009-01-28 20:16:48.000000000 +0000 +++ src/uk/ac/gla/terrier/matching/dsms/BlockScoreModifier.java 2009-03-03 14:34:49.000000000 +0000 @@ -65,7 +65,7 @@ if (invertedIndex instanceof BlockInvertedIndex && query.length() > 1 && query.length() < 5) { - Lexicon lexicon = index.getLexicon(); + Lexicon lexicon = index.getLexicon(); int[] docids = resultSet.getDocids(); double[] scores = resultSet.getScores(); @@ -118,7 +118,7 @@ continue; //double term1KeyFrequency = query.getTermWeight(term1); - double term1DocumentFrequency = (double)tEntry1.n_t; + double term1DocumentFrequency = (double)tEntry1.getDocumentFrequency(); //we seek the 2nd query term in the lexicon LexiconEntry tEntry2 = lexicon.getLexiconEntry(term2); @@ -126,7 +126,7 @@ if (tEntry1 == null) continue; //double term2KeyFrequency = query.getTermWeight(term2); - double term2DocumentFrequency = (double)tEntry2.n_t; + double term2DocumentFrequency = (double)tEntry2.getDocumentFrequency(); term1Pointers = invertedIndex.getDocuments(tEntry1); term1docids = term1Pointers[0]; diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/matching/dsms/PhraseScoreModifier.java src/uk/ac/gla/terrier/matching/dsms/PhraseScoreModifier.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/matching/dsms/PhraseScoreModifier.java 2009-01-28 20:16:48.000000000 +0000 +++ src/uk/ac/gla/terrier/matching/dsms/PhraseScoreModifier.java 2009-03-03 14:34:49.000000000 +0000 @@ -26,22 +26,22 @@ */ package uk.ac.gla.terrier.matching.dsms; -import gnu.trove.TIntArrayList; import gnu.trove.TIntIntHashMap; import java.util.ArrayList; import java.util.Arrays; +import org.apache.log4j.Logger; + import uk.ac.gla.terrier.matching.MatchingQueryTerms; import uk.ac.gla.terrier.matching.ResultSet; import uk.ac.gla.terrier.querying.parser.SingleTermQuery; import uk.ac.gla.terrier.structures.BlockInvertedIndex; import uk.ac.gla.terrier.structures.Index; import uk.ac.gla.terrier.structures.InvertedIndex; +import uk.ac.gla.terrier.structures.LexiconEntry; import uk.ac.gla.terrier.utility.ApplicationSetup; -import org.apache.log4j.Logger; - /** * Modifies the scores of the documents which contain, or do not contain a given * phrase. 
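// [Editor's note] In the hunk below, getDocuments(LexiconEntry) returns the block
// postings as parallel arrays: postings[0] docids, postings[1] tfs, postings[2] field
// scores, postings[3] block frequencies (converted in the loop into cumulative
// offsets), postings[4] block ids. After that conversion, the positions of the term
// in the j-th matched document can be read as follows (sketch):
//
//   int start = (j == 0) ? 0 : postings[3][j - 1];
//   int end = postings[3][j];
//   for (int k = start; k < end; k++) {
//       int blockId = postings[4][k];   // a position of the term in document j
//   }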
@@ -186,33 +186,33 @@ for (int i = 0; i < phraseLength; i++) { docidsMap[i] = new TIntIntHashMap(); String t = ((SingleTermQuery) phraseTerms.get(i)).getTerm(); - if (terms.getTermCode(t) == -1) { - index.getLexicon().findTerm(t); - int termCode = index.getLexicon().getTermId(); - terms.setTermProperty(t, termCode); + if (terms.getStatistics(t) == null) + { + LexiconEntry le = index.getLexicon().getLexiconEntry(t); + if (le == null) + continue; + terms.setTermProperty(t, le); } - int termCode = terms.getTermCode(t); - if (termCode != -1) { - //for each phrase term, we store the identifiers of - //documents that contain that term in a hashmap - //we also convert the block frequencies into - //indexes for the block ids array, so that we - //can obtain easily the block ids of a phrase - //term for each document. - // - //For j-th document in the postings lists postings[i] - //the positions start at postings[i][4][postings[i][3][j-1]] - //and end at postings[i][4][postings[i][3][j]-1] - postings[i] = invIndex.getDocuments(terms.getTermCode(t)); - - for (int j = 0; j < postings[i][0].length; j++) { - //note that the entries in the docidsMap hash sets have - //been increased by one - docidsMap[i].put(postings[i][0][j] + 1, j + 1); - if (j > 0) - postings[i][3][j] += postings[i][3][j - 1]; - } + + //for each phrase term, we store the identifiers of + //documents that contain that term in a hashmap + //we also convert the block frequencies into + //indexes for the block ids array, so that we + //can obtain easily the block ids of a phrase + //term for each document. + // + //For j-th document in the postings lists postings[i] + //the positions start at postings[i][4][postings[i][3][j-1]] + //and end at postings[i][4][postings[i][3][j]-1] + postings[i] = invIndex.getDocuments((LexiconEntry)terms.getStatistics(t)); + + for (int j = 0; j < postings[i][0].length; j++) { + //note that the entries in the docidsMap hash sets have + //been increased by one + docidsMap[i].put(postings[i][0][j] + 1, j + 1); + if (j > 0) + postings[i][3][j] += postings[i][3][j - 1]; } } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BasicLexiconEntry.java src/uk/ac/gla/terrier/structures/BasicLexiconEntry.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BasicLexiconEntry.java 1970-01-01 01:00:00.000000000 +0100 +++ src/uk/ac/gla/terrier/structures/BasicLexiconEntry.java 2009-03-03 14:34:49.000000000 +0000 @@ -0,0 +1,252 @@ +/* + * Terrier - Terabyte Retriever + * Webpage: http://ir.dcs.gla.ac.uk/terrier + * Contact: terrier{a.}dcs.gla.ac.uk + * University of Glasgow - Department of Computing Science + * http://www.gla.ac.uk/ + * + * The contents of this file are subject to the Mozilla Public License + * Version 1.1 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See + * the License for the specific language governing rights and limitations + * under the License. + * + * The Original Code is BlockDirectIndex.java. + * + * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. + * All Rights Reserved. 
+ * + * Contributor(s): + * Vassilis Plachouras (original author) + * Craig Macdonald + */ +package uk.ac.gla.terrier.structures; + +import static org.junit.Assert.assertEquals; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInput; +import java.io.DataInputStream; +import java.io.DataOutput; +import java.io.DataOutputStream; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; + +import org.apache.hadoop.io.Writable; +import org.junit.Test; + +import uk.ac.gla.terrier.structures.seralization.FixedSizeWriteableFactory; + +/** Contains all the information about one entry in the Lexicon. + * Created to make thread-safe lookups in the Lexicon easier. */ +public class BasicLexiconEntry extends LexiconEntry { + + public static class Factory implements FixedSizeWriteableFactory + { + public int getSize() { + //System.err.println("Value size is"+((3*4) + 8 + 1)); + return (3*4) + 8 + 1; + } + public LexiconEntry newInstance() { + return new BasicLexiconEntry(); + } + + public static class Tester + { + @Test public void testBasic() throws Exception + { + LexiconEntry le = new BasicLexiconEntry(); + //term id + le.setTermId(1); + assertEquals(le.getTermId(), 1); + //position + le.setPosition(0, (byte)0); + assertEquals(le.getBytes(), 0); + assertEquals(le.getBits(), (byte)0); + + le = new BasicLexiconEntry(2, 2, 5); + assertEquals(le.getDocumentFrequency(), 2); + assertEquals(le.getFrequency(), 5); + le.add(new BasicLexiconEntry(0,1,10)); + assertEquals(le.getDocumentFrequency(), 3); + assertEquals(le.getFrequency(), 15); + } + + @Test public void testWritable() throws Exception + { + Factory f = new Factory(); + LexiconEntry le = f.newInstance(); + le.setTermId(100); + le.setPosition(10, (byte)11); + byte[] b = getBytes(le); + System.err.println("le written in "+b.length+" bytes"); + assertEquals(b.length, f.getSize()); + LexiconEntry leReader = f.newInstance(); + populateEntry(leReader, b); + assertEquals(le.getTermId(), leReader.getTermId()); + assertEquals(le.getFrequency(), leReader.getFrequency()); + assertEquals(le.getDocumentFrequency(), leReader.getDocumentFrequency()); + assertEquals(le.getBytes(), leReader.getBytes()); + assertEquals(le.getBits(), leReader.getBits()); + } + + @Test public void testWritableFile() throws Exception + { + Factory f = new Factory(); + LexiconEntry le = f.newInstance(); + le.setTermId(100); + le.setPosition(10, (byte)11); + DataOutputStream dos = new DataOutputStream(new FileOutputStream("/tmp/testFile")); + le.write(dos); + dos.close(); + LexiconEntry leReader = f.newInstance(); + leReader.readFields(new DataInputStream(new FileInputStream("/tmp/testFile"))); + assertEquals(le.getTermId(), leReader.getTermId()); + assertEquals(le.getFrequency(), leReader.getFrequency()); + assertEquals(le.getDocumentFrequency(), leReader.getDocumentFrequency()); + assertEquals(le.getBytes(), leReader.getBytes()); + assertEquals(le.getBits(), leReader.getBits()); + new java.io.File("/tmp/testFile").delete(); + } + + static void populateEntry(LexiconEntry le, byte[] b) throws Exception + { + le.readFields(new DataInputStream(new ByteArrayInputStream(b))); + } + + static byte[] getBytes(Writable w) throws Exception + { + ByteArrayOutputStream buffer = new ByteArrayOutputStream(); + DataOutputStream dos = new DataOutputStream(buffer); + w.write(dos); + return buffer.toByteArray(); + } + } + } + + /** the termid of this entry */ + public int termId; + /** the number of document that this 
entry occurs in */ + public int n_t; + /** the total number of occurrences of the term in the index */ + public int TF; + /** the start offset of the entry in the inverted index */ + public long startOffset; + /** the start bit offset of the entry in the inverted index */ + public byte startBitOffset; + + /** Create an empty LexiconEntry */ + public BasicLexiconEntry(){} + + /** Create a lexicon entry with the following information. + * @param tid the term id + * @param n_t the number of documents the term occurs in (document frequency) + * @param TF the total count of therm t in the collection + */ + public BasicLexiconEntry(int tid, int n_t, int TF) + { + this.termId = tid; + this.n_t = n_t; + this.TF = TF; + } + + public BasicLexiconEntry(int tid, int n_t, int TF, long _startOffset, byte _startBitOffset) { + this.termId = tid; + this.n_t = n_t; + this.TF = TF; + this.startOffset = _startOffset; + this.startBitOffset = _startBitOffset; + } + + public BasicLexiconEntry(int tid, int n_t, int TF, BitFilePosition offset) { + this.termId = tid; + this.n_t = n_t; + this.TF = TF; + this.startOffset = offset.getBytes(); + this.startBitOffset = offset.getBits(); + } + + public void setStatistics(int n_t, int TF) + { + this.n_t = n_t; + this.TF = TF; + } + + /** increment this lexicon entry by another */ + public void add(TermStatistics le) + { + this.n_t += le.getDocumentFrequency(); + this.TF += le.getFrequency(); + } + + /** alter this lexicon entry to subtract another lexicon entry */ + public void subtract(TermStatistics le) + { + this.n_t -= le.getDocumentFrequency(); + this.TF -= le.getFrequency(); + } + + + /** returns a string representation of this lexicon entry */ + public String toString() { + return "term"+ termId + " Nt=" + n_t + " TF=" + TF + + " @{" + startOffset + " " + startBitOffset+"}"; + } + + public int getDocumentFrequency() { + return n_t; + } + + public int getFrequency() { + return TF; + } + + public int getTermId() { + return termId; + } + + public int getNumberOfEntries() { + return n_t; + } + + public byte getBits() { + return startBitOffset; + } + + public long getBytes() { + return startOffset; + } + + public void setTermId(int newTermId) + { + termId = newTermId; + } + + public void setPosition(long bytes, byte bits) + { + startOffset = bytes; + startBitOffset = bits; + } + + public void readFields(DataInput in) throws IOException { + termId = in.readInt(); + TF = in.readInt(); + n_t = in.readInt(); + startOffset = in.readLong(); + startBitOffset = in.readByte(); + } + + public void write(DataOutput out) throws IOException { + out.writeInt(termId); + out.writeInt(TF); + out.writeInt(n_t); + out.writeLong(startOffset); + out.writeByte(startBitOffset); + } +} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BasicTermStatsLexiconEntry.java src/uk/ac/gla/terrier/structures/BasicTermStatsLexiconEntry.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BasicTermStatsLexiconEntry.java 1970-01-01 01:00:00.000000000 +0100 +++ src/uk/ac/gla/terrier/structures/BasicTermStatsLexiconEntry.java 2009-03-03 14:34:49.000000000 +0000 @@ -0,0 +1,89 @@ +package uk.ac.gla.terrier.structures; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +public class BasicTermStatsLexiconEntry extends LexiconEntry { + protected int n_t; + protected int TF; + protected int termId; + + public BasicTermStatsLexiconEntry() {} + + public 
BasicTermStatsLexiconEntry(int _TF, int _n_t, int _termId) + { + TF = _TF; + n_t = _n_t; + termId = _termId; + } + + public int getDocumentFrequency() { + return n_t; + } + + public void setDocumentFrequency(int _n_t) { + n_t = _n_t; + } + + public int getFrequency() { + return TF; + } + + public void setFrequency(int _TF) { + TF = _TF; + } + + public int getTermId() { + return termId; + } + + public void setTermId(int _termId) { + termId = _termId; + } + + public void setAll(int _TF, int _n_t, int _termId) { + TF = _TF; + n_t = _n_t; + termId = _termId; + } + + public int getNumberOfEntries() { + return n_t; + } + + public byte getBits() { + return 0; + } + + public long getBytes() { + return 0; + } + + public void setPosition(long bytes, byte bits) + { + } + + public void readFields(DataInput in) throws IOException { + TF = in.readInt(); + n_t = in.readInt(); + termId = in.readInt(); + } + + public void write(DataOutput out) throws IOException { + out.writeInt(TF); + out.writeInt(n_t); + out.writeInt(termId); + } + + public void add(TermStatistics le) { + TF += le.getFrequency(); + n_t += le.getDocumentFrequency(); + } + + public void subtract(TermStatistics le) { + this.n_t -= le.getDocumentFrequency(); + this.TF -= le.getFrequency(); + } + +} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BitFilePosition.java src/uk/ac/gla/terrier/structures/BitFilePosition.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BitFilePosition.java 1970-01-01 01:00:00.000000000 +0100 +++ src/uk/ac/gla/terrier/structures/BitFilePosition.java 2009-03-03 14:34:49.000000000 +0000 @@ -0,0 +1,7 @@ +package uk.ac.gla.terrier.structures; +public interface BitFilePosition +{ + public long getBytes(); + public byte getBits(); + public void setPosition(long bytes, byte bits); +} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BitIndexPointer.java src/uk/ac/gla/terrier/structures/BitIndexPointer.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BitIndexPointer.java 1970-01-01 01:00:00.000000000 +0100 +++ src/uk/ac/gla/terrier/structures/BitIndexPointer.java 2009-03-03 14:34:49.000000000 +0000 @@ -0,0 +1,5 @@ +package uk.ac.gla.terrier.structures; + +public interface BitIndexPointer extends BitFilePosition { + public int getNumberOfEntries(); +} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BlockInvertedIndex.java src/uk/ac/gla/terrier/structures/BlockInvertedIndex.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BlockInvertedIndex.java 2009-01-28 20:16:54.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/BlockInvertedIndex.java 2009-03-03 14:34:49.000000000 +0000 @@ -34,34 +34,24 @@ /** * This class implements the block field inverted * index for performing retrieval. - * @author Douglas Johnson + * @author Douglas Johnson, Craig Macdonald et al. * @version $Revision: 1.32 $ */ public class BlockInvertedIndex extends InvertedIndex implements IndexConfigurable { protected int DocumentBlockCountDelta = 1; protected BlockInvertedIndex() {} - /** - * Creates an instance of the BlockInvertedIndex class - * using the given lexicon. 
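// [Editor's note] The removed lexicon-taking constructors reflect the wider convention
// introduced by this patch: index structures are built from an (Index, structureName)
// pair, as the replacement constructor below shows, and are normally obtained through
// the index, which instantiates them from the addIndexStructure(...) registrations.
// A sketch:
//
//   Index index = Index.createIndex();
//   InvertedIndex inv = (InvertedIndex) index.getIndexStructure("inverted");
//   // equivalent, given the registration:
//   // new BlockInvertedIndex(index, "inverted")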
- * @param lexicon The lexicon used for retrieval - */ - public BlockInvertedIndex(Lexicon lexicon) { - super(lexicon); - } - - public BlockInvertedIndex(Lexicon lexicon, String path, String prefix) { - super(lexicon, path, prefix); + public BlockInvertedIndex(Index index, String structureName) { + super(index, structureName); } /** * Creates an instance of the BlockInvertedIndex class * using the given lexicon. - * @param lexicon The lexicon used for retrieval * @param filename the name of the inverted file */ - public BlockInvertedIndex(Lexicon lexicon, String filename) { - super(lexicon, filename); + public BlockInvertedIndex(String filename) { + super(filename); } /** let it know which index to use */ @@ -70,38 +60,7 @@ DocumentBlockCountDelta = i.getIntIndexProperty("blocks.invertedindex.countdelta", 1); } - /** - * Prints out the block inverted index file. - */ - public void print() { - for (int i = 0; i < lexicon.getNumberOfLexiconEntries(); i++) { - lexicon.findTerm(i); - System.out.print("Term ("+lexicon.getTerm()+","+i+") : "); - int[][] documents = getDocuments(i); - int blockindex = 0; - for (int j = 0; j < documents[0].length; j++) { - System.out.print( - "(" - + documents[0][j] - + ", " - + documents[1][j] - + ", "); - if (FieldScore.USE_FIELD_INFORMATION) - { - System.out.print(documents[2][j] - + ", "); - } - System.out.print( documents[3][j]); - - for (int k = 0; k < documents[3][j]; k++) { - System.out.print(", B" + documents[4][blockindex]); - blockindex++; - } - System.out.print(")"); - } - System.out.println(); - } - } + /** * Returns a 2D array containing the document ids, * the term frequencies, the field scores the block frequencies and @@ -111,14 +70,15 @@ * frequencies, while the last vector contains the * block identifiers and it has a different length from * the document identifiers. - * @param startOffset start byte of the postings in the inverted file - * @param startBitOffset start bit of the postings in the inverted file - * @param endOffset end byte of the postings in the inverted file - * @param endBitOffset end bit of the postings in the inverted file - * @param df the number of postings to expect + * @param pointer start byte and bit offset of the postings in the inverted file, + * together with number of postings to expect */ - public int[][] getDocuments(final long startOffset, final byte startBitOffset, final long endOffset, final byte endBitOffset, final int df) { + public int[][] getDocuments(BitIndexPointer pointer) { + + final long startOffset = pointer.getBytes(); + final byte startBitOffset = pointer.getBits(); + final int df = pointer.getNumberOfEntries(); final int fieldCount = FieldScore.FIELDS_COUNT; final boolean loadTagInformation = FieldScore.USE_FIELD_INFORMATION; @@ -131,8 +91,7 @@ final TIntArrayList blockids = new TIntArrayList(df); //ideally we'd have TF here try{ - - final BitIn file = this.file.readReset(startOffset, startBitOffset, endOffset, endBitOffset); + final BitIn file = this.file.readReset(startOffset, startBitOffset); if (loadTagInformation) { //if there are tag information to process //documentTerms[2] = new int[df]; @@ -196,247 +155,4 @@ return null; } } - - - //* @param termid the id of the term whose documents we are looking for. 
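// [Editor's note -- assumption flagged: the patch defines BitIndexPointer and gives
// LexiconEntry the matching accessors (getBytes/getBits/getNumberOfEntries), so a
// LexiconEntry is presumed usable as the pointer consumed by
// getDocuments(BitIndexPointer) above; the document frequency bounds the read.] Sketch:
//
//   LexiconEntry le = lexicon.getLexiconEntry("terrier");
//   if (le != null) {
//       int[][] postings = invIndex.getDocuments(le);  // le supplies offset + entry count
//   }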
- //public int[][] getDocuments(int termid) { - /*public int[][] getDocuments(final long startOffset, final byte startBitOffset, final long endOffset, final byte endBitOffset, int df) { - - //boolean found = lexicon.findTerm(termid); - final byte startBitOffset = lexicon.getStartBitOffset(); - final long startOffset = lexicon.getStartOffset(); - final byte endBitOffset = lexicon.getEndBitOffset(); - final long endOffset = lexicon.getEndOffset(); - - final int FIELDS_COUNT = FieldScore.FIELDS_COUNT; - - // TODO use heuristics here like we do in InvertedIndex.java - // for setting a good guess of the arraylist sizes. - TIntArrayList temporaryDocids = new TIntArrayList(); - TIntArrayList temporaryTFs = new TIntArrayList(); - TIntArrayList temporaryFields = new TIntArrayList(); - TIntArrayList temporaryBlockFreq = new TIntArrayList(); - TIntArrayList temporaryBlockIds = new TIntArrayList(); - int previousDocid = -1; - - //ArrayList temporaryTerms = new ArrayList(); - //ArrayList temporaryBlockids = new ArrayList(); - //int blockcount = 0; - try{ - final BitIn file = this.file.readReset(startOffset, startBitOffset, endOffset, endBitOffset); - //boolean hasMore = false; - while (((file.getByteOffset() + startOffset) < endOffset) - || (((file.getByteOffset() + startOffset) == endOffset) - && (file.getBitOffset() < endBitOffset))) { - - temporaryDocids.add(previousDocid = file.readGamma() + previousDocid); - temporaryTFs.add(file.readUnary()); - temporaryFields.add(file.readBinary(FIELDS_COUNT)); - - /*int docId = file.readGamma(); - /int[] tmp = new int[4]; - tmp[0] = docId; - tmp[1] = file.readUnary(); - tmp[2] = file.readBinary(FIELDS_COUNT); - - final int blockfreq = file.readUnary(); - temporaryBlockFreq.add(blockfreq); - //tmp[3] = blockfreq; - //System.out.print("docid="+previousDocid + "blockfreq="+blockfreq); - - int[] tmp2 = new int[blockfreq]; - int previousBlockId = -1; - //System.out.print(" blocks="); - for (int i = 0; i < blockfreq; i++) { - tmp2[i] = previousBlockId = file.readGamma() + previousBlockId; - //System.out.print(previousBlockId + ","); - //blockcount++; - } - // System.out.println(""); - //temporaryTerms.add(tmp); - //temporaryBlockids.add(tmp2); - temporaryBlockIds.add(tmp2); - } - int[][] documentTerms = new int[5][]; - documentTerms[0] = temporaryDocids.toNativeArray(); //new int[temporaryTerms.size()]; - documentTerms[1] = temporaryTFs.toNativeArray(); //new int[temporaryTerms.size()]; - documentTerms[2] = temporaryFields.toNativeArray(); //new int[temporaryTerms.size()]; - documentTerms[3] = temporaryBlockFreq.toNativeArray(); //new int[temporaryTerms.size()]; - documentTerms[4] = temporaryBlockIds.toNativeArray(); //new int[blockcount]; - /* - documentTerms[0][0] = ((int[]) temporaryTerms.get(0))[0] - 1; - documentTerms[1][0] = ((int[]) temporaryTerms.get(0))[1]; - documentTerms[2][0] = ((int[]) temporaryTerms.get(0))[2]; - documentTerms[3][0] = ((int[]) temporaryTerms.get(0))[3]; - int[] blockids = ((int[]) temporaryBlockids.get(0)); - documentTerms[4][0] = blockids[0] - 1; - for (int i = 1; i < blockids.length; i++) { - documentTerms[4][i] = blockids[i] + documentTerms[4][i - 1]; - } - int blockindex = blockids.length; - if (documentTerms[0].length > 1) { - for (int i = 1; i < documentTerms[0].length; i++) { - int[] tmpMatrix = (int[]) temporaryTerms.get(i); - documentTerms[0][i] = tmpMatrix[0] + documentTerms[0][i - 1]; - documentTerms[1][i] = tmpMatrix[1]; - documentTerms[2][i] = tmpMatrix[2]; - documentTerms[3][i] = tmpMatrix[3]; - blockids = ((int[]) 
temporaryBlockids.get(i)); - documentTerms[4][blockindex] = blockids[0] - 1; - blockindex++; - for (int j = 1; j < blockids.length; j++) { - documentTerms[4][blockindex] = - blockids[j] + documentTerms[4][blockindex - 1]; - blockindex++; - } - } - } - return documentTerms; - }catch (IOException ioe) { - logger.error("Problem reading direct index", ioe); - return null; - } - }*/ - - - /*public int[][] getDocumentsWithoutBlocks(int termid, int startDocid, int endDocid) { - if (! lexicon.findTerm(termid)) - return null; - - byte startBitOffset = lexicon.getStartBitOffset(); - long startOffset = lexicon.getStartOffset(); - byte endBitOffset = lexicon.getEndBitOffset(); - long endOffset = lexicon.getEndOffset(); - // TODO use heuristics here like we do in InvertedIndex.java - // for setting a good guess of the arraylist sizes. - ArrayList temporaryTerms = new ArrayList(); - //int blockcount = 0; - try{ - final BitIn file = this.file.readReset(startOffset, startBitOffset, endOffset, endBitOffset); - //boolean hasMore = false; - final int fieldCount = FieldScore.FIELDS_COUNT; - while (((file.getByteOffset() + startOffset) < endOffset) - || (((file.getByteOffset() + startOffset) == endOffset) - && (file.getBitOffset() < endBitOffset))) { - int docId = file.readGamma(); - int[] tmp = new int[3]; - tmp[0] = docId; - tmp[1] = file.readUnary(); - tmp[2] = file.readBinary(fieldCount); - - //read the blocks, but dont save them - int blockfreq = file.readUnary(); - for (int i = 0; i < blockfreq; i++) { - file.readGamma(); - } - if (docId >= startDocid && docId <=endDocid){ - temporaryTerms.add(tmp); - } - } - int[][] documentTerms = new int[3][]; - if (temporaryTerms.size()>0){ - documentTerms[0] = new int[temporaryTerms.size()]; - documentTerms[1] = new int[temporaryTerms.size()]; - documentTerms[2] = new int[temporaryTerms.size()]; - - documentTerms[0][0] = ((int[]) temporaryTerms.get(0))[0] - 1; - documentTerms[1][0] = ((int[]) temporaryTerms.get(0))[1]; - documentTerms[2][0] = ((int[]) temporaryTerms.get(0))[2]; - - if (documentTerms[0].length > 1) { - for (int i = 1; i < documentTerms[0].length; i++) { - int[] tmpMatrix = (int[]) temporaryTerms.get(i); - documentTerms[0][i] = tmpMatrix[0] + documentTerms[0][i - 1]; - documentTerms[1][i] = tmpMatrix[1]; - documentTerms[2][i] = tmpMatrix[2]; - } - } - } - return documentTerms; - } catch (IOException ioe) { - logger.error("Problem reading direct index", ioe); - return null; - } - } - */ - public int[][] getDocuments(int termid) { - LexiconEntry lEntry = lexicon.getLexiconEntry(termid); - if (lEntry == null) - return null; - return getDocuments(lEntry.startOffset, - lEntry.startBitOffset, - lEntry.endOffset, - lEntry.endBitOffset, lEntry.n_t); - } - public int[][] getDocumentsWithoutBlocks(int termid) { - LexiconEntry lEntry = lexicon.getLexiconEntry(termid); - if (lEntry == null) - return null; - return getDocumentsWithoutBlocks(lEntry.startOffset, - lEntry.startBitOffset, - lEntry.endOffset, - lEntry.endBitOffset, lEntry.n_t); - } - - public int[][] getDocumentsWithoutBlocks(LexiconEntry lEntry) - { - return getDocumentsWithoutBlocks( - lEntry.startOffset, - lEntry.startBitOffset, - lEntry.endOffset, - lEntry.endBitOffset, lEntry.n_t); - } - - public int[][] getDocumentsWithoutBlocks(long startOffset, byte startBitOffset, long endOffset, byte endBitOffset, int df) - { - int[][] documentTerms = null; - try{ - final BitIn file = this.file.readReset(startOffset, startBitOffset, endOffset, endBitOffset); - final int fieldCount = FieldScore.FIELDS_COUNT; - 
final boolean loadTagInformation = FieldScore.USE_FIELD_INFORMATION; - if (loadTagInformation) { //if there are tag information to process - documentTerms = new int[3][df]; - documentTerms[0][0] = file.readGamma() - 1; - documentTerms[1][0] = file.readUnary(); - documentTerms[2][0] = file.readBinary(fieldCount); - //read the blocks, but dont save them - int blockfreq = file.readUnary() - DocumentBlockCountDelta; - for (int j = 0; j < blockfreq; j++) { - file.readGamma(); - } - for (int i = 1; i < df; i++) { - documentTerms[0][i] = file.readGamma() + documentTerms[0][i - 1]; - documentTerms[1][i] = file.readUnary(); - documentTerms[2][i] = file.readBinary(fieldCount); - //read the blocks, but dont save them - blockfreq = file.readUnary() - DocumentBlockCountDelta; - for (int j = 0; j < blockfreq; j++) { - file.readGamma(); - } - } - } else { //no tag information to process - documentTerms = new int[2][df]; - documentTerms[0][0] = file.readGamma() - 1; - documentTerms[1][0] = file.readUnary(); - //read the blocks, but dont save them - int blockfreq = file.readUnary() - DocumentBlockCountDelta; - for (int j = 0; j < blockfreq; j++) { - file.readGamma(); - } - for(int i = 1; i < df; i++){ - documentTerms[0][i] = file.readGamma() + documentTerms[0][i - 1]; - documentTerms[1][i] = file.readUnary(); - //read the blocks, but dont save them - blockfreq = file.readUnary() - DocumentBlockCountDelta; - for (int j = 0; j < blockfreq; j++) { - file.readGamma(); - } - } - } - return documentTerms; - } catch (IOException ioe) { - logger.error("Problem reading inverted index", ioe); - return null; - } - } } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BlockInvertedIndexInputStream.java src/uk/ac/gla/terrier/structures/BlockInvertedIndexInputStream.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BlockInvertedIndexInputStream.java 2009-01-28 20:16:54.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/BlockInvertedIndexInputStream.java 2009-03-03 14:34:49.000000000 +0000 @@ -26,10 +26,12 @@ */ package uk.ac.gla.terrier.structures; -import java.io.IOException; -import uk.ac.gla.terrier.structures.LexiconInputStream; import gnu.trove.TIntArrayList; -import uk.ac.gla.terrier.compression.BitIn; + +import java.io.IOException; +import java.util.Iterator; +import java.util.Map; + import uk.ac.gla.terrier.utility.FieldScore; /** Reads a BlockInvertedIndex as a stream @@ -37,39 +39,19 @@ * @since 2.0 * @version $Revision: 1.4 $ */ -public class BlockInvertedIndexInputStream extends InvertedIndexInputStream implements IndexConfigurable +public class BlockInvertedIndexInputStream extends InvertedIndexInputStream { protected int DocumentBlockCountDelta = 1; - /** Make a new BlockInvertedIndexInputStream from the specified path/prefix combo. The LexiconInputStream - * is required to determine the offsets and the document frequency - ie number of postings for - * each term. */ - public BlockInvertedIndexInputStream(String path, String prefix, LexiconInputStream lis) throws IOException - { - super(path, prefix, lis); - } - - /** Make a new BlockInvertedIndexInputStream from the specified filename. The LexiconInputStream - * is required to determine the offsets and the document frequency - ie number of postings for - * each term. 
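
The removed getDocumentsWithoutBlocks() above decoded each posting but threw its block ids away. Its skip idiom, restated as a sketch against the patch's BitIn API (readGamma/readUnary/readBinary); countDelta corresponds to the blocks.invertedindex.countdelta property:

import java.io.IOException;
import uk.ac.gla.terrier.compression.BitIn;

final class BlockSkip {
    /** Decode one posting, consuming but not keeping its block ids. */
    static void skipOnePosting(BitIn in, boolean useFields, int fieldCount, int countDelta)
            throws IOException {
        in.readGamma();                        // docid gap
        in.readUnary();                        // within-document frequency
        if (useFields)
            in.readBinary(fieldCount);         // field bitset, when fields are indexed
        final int blockFreq = in.readUnary() - countDelta;
        for (int j = 0; j < blockFreq; j++)
            in.readGamma();                    // discard the block-id gaps
    }
}
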
- * @param filename Location of the inverted file to open */ - public BlockInvertedIndexInputStream(String filename, LexiconInputStream lis) throws IOException - { - super(filename, lis); - } - - public BlockInvertedIndexInputStream(BitIn invFile, LexiconInputStream lis) throws IOException + + public BlockInvertedIndexInputStream(Index _index, String structureName, Iterator<Map.Entry<String,LexiconEntry>> positions) throws IOException { - super(invFile, lis); + super(_index, structureName, positions); + DocumentBlockCountDelta = _index.getIntIndexProperty("blocks.invertedindex.countdelta", 1); } - - /** let it know which index to use */ - public void setIndex(Index i) - { - DocumentBlockCountDelta = i.getIntIndexProperty("blocks.invertedindex.countdelta", 1); - } - - protected int[][] getNextDocuments(int df, long endByteOffset, byte endBitOffset) throws IOException { - final int fieldCount = FieldScore.FIELDS_COUNT; + + protected int[][] getNextDocuments(BitIndexPointer pointer) throws IOException { + final int df = pointer.getNumberOfEntries(); + final int fieldCount = FieldScore.FIELDS_COUNT; final boolean loadTagInformation = FieldScore.USE_FIELD_INFORMATION; final int[][] documentTerms = new int[5][]; @@ -144,7 +126,7 @@ try{ while((documents = getNextDocuments()) != null) { - System.out.print("tid"+i); + System.out.print(i+"th term:"); int blockindex = 0; for (int j = 0; j < documents[0].length; j++) { System.out.print( diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BlockLexicon.java src/uk/ac/gla/terrier/structures/BlockLexicon.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BlockLexicon.java 2009-01-28 20:16:54.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/BlockLexicon.java 1970-01-01 01:00:00.000000000 +0100 @@ -1,305 +0,0 @@ -/* - * Terrier - Terabyte Retriever - * Webpage: http://ir.dcs.gla.ac.uk/terrier - * Contact: terrier{a.}dcs.gla.ac.uk - * University of Glasgow - Department of Computing Science - * http://www.gla.ac.uk/ - * - * The contents of this file are subject to the Mozilla Public License - * Version 1.1 (the "License"); you may not use this file except in - * compliance with the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is BlockLexicon.java. - * - * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. - * All Rights Reserved. - * - * Contributor(s): - * Douglas Johnson (original author) - * Vassilis Plachouras - */ -package uk.ac.gla.terrier.structures; -import java.io.File; -import java.io.IOException; -import java.util.Arrays; - -import org.apache.log4j.Logger; - -import uk.ac.gla.terrier.utility.ApplicationSetup; -import uk.ac.gla.terrier.utility.io.RandomDataOutput; -/** - * A lexicon class that saves the number of - * different blocks a term appears in. It is used only during - * creating the block inverted index. After the block inverted - * index has been created, the block lexicon is transformed into - * a lexicon. 
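
With the constructor change above, a driver pairs the stream with the lexicon input stream that supplies each term's offsets and posting count. A hedged sketch; the "lexicon"/"inverted" structure names and the cast of getIndexStructureInputStream() are assumptions:

import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import uk.ac.gla.terrier.structures.*;

final class ScanBlockIndex {
    @SuppressWarnings("unchecked")
    static void scan(Index index) throws IOException {
        Iterator<Map.Entry<String,LexiconEntry>> positions =
            (Iterator<Map.Entry<String,LexiconEntry>>) index.getIndexStructureInputStream("lexicon");
        BlockInvertedIndexInputStream inv =
            new BlockInvertedIndexInputStream(index, "inverted", positions);
        int[][] postings;
        while ((postings = inv.getNextDocuments()) != null) {
            // postings[0]=docids, [1]=tfs, [2]=fields, [3]=block counts, [4]=block ids
        }
        inv.close();
    }
}
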
- * @author Douglas Johnson, Vassilis Plachouras - * @version $Revision: 1.33 $ - */ -public class BlockLexicon extends Lexicon { - /** The logger used */ - private static Logger logger = Logger.getRootLogger(); - /** The block frequency of the term */ - protected int blockFrequency; - /** - * The size in bytes of an entry in the lexicon file. An entry corresponds - * to a string, an int (termCode), an int (docf), an int (tf), a long (the - * offset of the end of the term's entry in bytes in the inverted file) and - * a byte (the offset in bits of the last byte of the term's entry in the - * inverted file. - */ - public static final int lexiconEntryLength = - ApplicationSetup.STRING_BYTE_LENGTH //the string representation - + 16 //the four integers - + 8 //the long - + 1; //the byte - /** - * A default constructor. - */ - public BlockLexicon() { - super(); - - try { - numberOfLexiconEntries = (int) (lexiconFile.length() / (long)BlockLexicon.lexiconEntryLength); - bufferInput.mark(3 * lexiconEntryLength); - } catch (IOException ioe) { - logger.fatal - ("Input/output exception while opening for reading the lexicon file. Stack trace follows",ioe); - } - inputStreamClass = BlockLexiconInputStream.class; - } - /** - * Constructs an instace of BlockLexicon and opens the corresponding file. - * @param lexiconName the name of the lexicon file. - */ - public BlockLexicon(String lexiconName) { - super(lexiconName); - try { - numberOfLexiconEntries = (int) (lexiconFile.length() / (long)BlockLexicon.lexiconEntryLength); - bufferInput.mark(3 * lexiconEntryLength); - } catch (IOException ioe) { - logger.fatal("Input/output exception while opening for reading the " + - "lexicon file. Stack trace follows",ioe); - } - inputStreamClass = BlockLexiconInputStream.class; - } - - public BlockLexicon(String path, String prefix) - { - this(path + ApplicationSetup.FILE_SEPARATOR + prefix + ApplicationSetup.LEXICONSUFFIX); - } - - /** - * Finds the term given its term code. - * - * @return true if the term is found, else return false - * @param termId - * the term's id - */ - public boolean findTerm(int termId) { - try { - idToOffsetFile.seek((long)termId * 8L); - long lexiconOffset = idToOffsetFile.readLong(); - if (lexiconOffset == 0) { - startOffset = 0; - startBitOffset = 0; - lexiconFile.seek(lexiconOffset); - lexiconFile.readFully(bt, 0, ApplicationSetup.STRING_BYTE_LENGTH); - term = new String(bt); - this.termId = lexiconFile.readInt(); - documentFrequency = lexiconFile.readInt(); - blockFrequency = lexiconFile.readInt(); - termFrequency = lexiconFile.readInt(); - endOffset = lexiconFile.readLong(); - endBitOffset = lexiconFile.readByte(); - return true; - } else { - lexiconFile.seek(lexiconOffset - 9L); - //goes to the lexicon offset minus the long offset and a byte - startOffset = lexiconFile.readLong(); - startBitOffset = lexiconFile.readByte(); - startBitOffset++; - if (startBitOffset == 8) { - startBitOffset = 0; - startOffset++; - } - lexiconFile.readFully(bt, 0, ApplicationSetup.STRING_BYTE_LENGTH); - term = new String(bt); - this.termId = lexiconFile.readInt(); - documentFrequency = lexiconFile.readInt(); - blockFrequency = lexiconFile.readInt(); - termFrequency = lexiconFile.readInt(); - endOffset = lexiconFile.readLong(); - endBitOffset = lexiconFile.readByte(); - return true; - } - } catch (IOException ioe) { - logger.fatal("Input/Output exception while reading the idToOffset file. 
Stack trace follows.",ioe); - } - return false; - } - /** - * Performs a binary search in the lexicon in order to locate the given - * term. If the term is located, the properties termCharacters, - * documentFrequency, termFrequency, startOffset, startBitOffset, endOffset - * and endBitOffset contain the values related to the term. - * - * @param _term the term to search for. - * @return true if the term is found, and false otherwise. - */ - public boolean findTerm(String _term) { - Arrays.fill(buffer, (byte) 0); - Arrays.fill(bt, (byte) 0); - //byte[] bt = _term.getBytes(); - final int termLength = ApplicationSetup.STRING_BYTE_LENGTH; - //int _termId = 0; - long low = -1; - long high = numberOfLexiconEntries; - long i; - while (high-low>1) { - - i = (long)(high+low)/2; - try { - lexiconFile.seek((long)i * (long)BlockLexicon.lexiconEntryLength); - lexiconFile.readFully(buffer, 0, termLength); - } catch (IOException ioe) { - logger.fatal( - "Input/Output exception while reading from lexicon file. Stack trace follows.",ioe); - } - - int compareResult = 0; - compareResult = _term.compareTo(new String(buffer).trim()); - - if (compareResult < 1) - high = i; - else - low = i; - } - if (high == numberOfLexiconEntries) - return false; - try { - lexiconFile.seek((long)high * (long)BlockLexicon.lexiconEntryLength); - lexiconFile.readFully(buffer, 0, termLength); - } catch (IOException ioe) { - logger.fatal( - "Input/Output exception while reading from lexicon file. Stack trace follows.",ioe); - } - - if (_term.compareTo(new String(buffer).trim())==0) { - try { - findTerm(lexiconFile.readInt()); - return true; - }catch(IOException ioe) { - logger.fatal("Input/Output exception while reading from lexicon file. Stack trace follows.",ioe); - } - } - return false; - } - - /** - * Returns the block frequency for the given term - * @return int The block frequency for the given term - */ - public int getBlockFrequency() { - return blockFrequency; - } - /** - * Seeks the i-th entry of the lexicon. - * @param i - * The index of the entry we are looking for. - * @return true if the entry was found, false otherwise. - */ - public boolean seekEntry(int i) { - try { - if (i > numberOfLexiconEntries) - return false; - if (i == 0) { - lexiconFile.seek((long)i * (long)lexiconEntryLength); - startOffset = 0; - startBitOffset = 0; - lexiconFile.readFully(buffer, 0, ApplicationSetup.STRING_BYTE_LENGTH); - term = new String(buffer); - termId = lexiconFile.readInt(); - documentFrequency = lexiconFile.readInt(); - blockFrequency = lexiconFile.readInt(); - termFrequency = lexiconFile.readInt(); - endOffset = lexiconFile.readLong(); - endBitOffset = lexiconFile.readByte(); - return true; - } else { - lexiconFile.seek((long)i * (long)lexiconEntryLength - (long)lexiconEntryLength - + (long)ApplicationSetup.STRING_BYTE_LENGTH + 12L); - startOffset = lexiconFile.readLong(); - startBitOffset = lexiconFile.readByte(); - startBitOffset++; - if (startBitOffset == 8) { - startBitOffset = 0; - startOffset++; - } - lexiconFile.readFully(buffer, 0, ApplicationSetup.STRING_BYTE_LENGTH); - term = new String(buffer); - termId = lexiconFile.readInt(); - documentFrequency = lexiconFile.readInt(); - blockFrequency = lexiconFile.readInt(); - termFrequency = lexiconFile.readInt(); - endOffset = lexiconFile.readLong(); - endBitOffset = lexiconFile.readByte(); - return true; - } - } catch (IOException ioe) { - logger.fatal("Input/Output exception while reading the idToOffset file. 
Stack trace follows.",ioe); - } - return false; - } - - /** - * In an already stored entry in the lexicon file, the information about the - * term frequency, the endOffset in bytes, and the endBitOffset in the last - * byte, is updated. The term is specified by the index of the entry. - * - * @return true if the information is updated properly, otherwise return - * false - * @param i the i-th entry - * @param frequency the term's Frequency - * @param endOffset the offset of the ending byte in the inverted file - * @param endBitOffset the offset in bits in the ending byte in the term's entry in - * inverted file - * @deprecated The BlockLexicon is used during indexing, but not during - * retrieval. - */ - public boolean updateEntry(int i, int frequency, long endOffset, - byte endBitOffset) { - if (! (lexiconFile instanceof RandomDataOutput)) - return false; - RandomDataOutput _lexiconFile = (RandomDataOutput)lexiconFile; - try { - long lexiconOffset = (long)i * (long)lexiconEntryLength; - //we seek the offset where the frequency should be writen - _lexiconFile.seek((long)lexiconOffset - + (long)ApplicationSetup.STRING_BYTE_LENGTH + 8L); - _lexiconFile.writeInt(frequency); - _lexiconFile.writeLong(endOffset); - _lexiconFile.writeByte(endBitOffset); - } catch (IOException ioe) { - logger.fatal("Input/Output exception while updating the lexicon file. Stack trace follows."); - } - return false; - } - - public static int numberOfEntries(File f) - { - return (int)(f.length()/ (long)lexiconEntryLength); - } - - public static int numberOfEntries(String filename) - { - return numberOfEntries(new File(filename)); - } - -} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BlockLexiconEntry.java src/uk/ac/gla/terrier/structures/BlockLexiconEntry.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BlockLexiconEntry.java 1970-01-01 01:00:00.000000000 +0100 +++ src/uk/ac/gla/terrier/structures/BlockLexiconEntry.java 2009-03-03 14:34:49.000000000 +0000 @@ -0,0 +1,69 @@ +package uk.ac.gla.terrier.structures; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +public class BlockLexiconEntry extends BasicLexiconEntry implements BlockTermStatistics { + int blockCount; + + public static class Factory extends BasicLexiconEntry.Factory + { + public int getSize() { + return super.getSize() + 4; + } + public LexiconEntry newInstance() { + return new BlockLexiconEntry(); + } + } + + public BlockLexiconEntry() { + super(); + } + + public BlockLexiconEntry(int tid, int n_t, int TF, BitFilePosition offset, int _blockCount) { + super(tid, n_t, TF, offset); + blockCount = _blockCount; + } + public BlockLexiconEntry(int tid, int n_t, int TF, long _startOffset, byte _startBitOffset, int _blockCount) { + super(tid, n_t, TF, _startOffset, _startBitOffset); + blockCount = _blockCount; + } + public BlockLexiconEntry(int tid, int n_t, int TF, int _blockCount) { + super(tid, n_t, TF); + blockCount = _blockCount; + } + /** @{inheritDoc} */ + public int getBlockCount() + { + return blockCount; + } + @Override + public void readFields(DataInput in) throws IOException { + super.readFields(in); + blockCount = in.readInt(); + + } + @Override + public void write(DataOutput out) throws IOException { + super.write(out); + out.writeInt(blockCount); + } + + @Override + public void add(TermStatistics le) + { + super.add(le); + if (le instanceof BlockTermStatistics) + blockCount += 
((BlockTermStatistics)le).getBlockCount(); + } + + @Override + public void subtract(TermStatistics le) + { + super.subtract(le); + if (le instanceof BlockTermStatistics) + blockCount -= ((BlockTermStatistics)le).getBlockCount(); + } + +} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BlockLexiconInputStream.java src/uk/ac/gla/terrier/structures/BlockLexiconInputStream.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BlockLexiconInputStream.java 2009-01-28 20:16:54.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/BlockLexiconInputStream.java 1970-01-01 01:00:00.000000000 +0100 @@ -1,159 +0,0 @@ -/* - * Terrier - Terabyte Retriever - * Webpage: http://ir.dcs.gla.ac.uk/terrier - * Contact: terrier{a.}dcs.gla.ac.uk - * University of Glasgow - Department of Computing Science - * http://www.gla.ac.uk/ - * - * The contents of this file are subject to the Mozilla Public License - * Version 1.1 (the "License"); you may not use this file except in - * compliance with the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is BlockLexiconInputStream.java. - * - * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. - * All Rights Reserved. - * - * Contributor(s): - * Douglas Johnson (original author) - * Vassilis Plachouras - */ -package uk.ac.gla.terrier.structures; -import java.io.*; - -import org.apache.log4j.Logger; - -import uk.ac.gla.terrier.utility.ApplicationSetup; -/** - * An input stream for accessing sequentially the entries - * of a block lexicon. - * @author Douglas Johnson, Vassilis Plachouras - * @version $Revision: 1.27 $ - */ -public class BlockLexiconInputStream extends LexiconInputStream { - /** The logger used */ - private static Logger logger = Logger.getRootLogger(); - - /** - * The total number of different blocks a term appears in. - */ - protected int blockFrequency; - /** - * A default constructor. - */ - public BlockLexiconInputStream() { - super(); - entrySize = BlockLexicon.lexiconEntryLength; - } - /** - * A constructor given the filename. - * @param filename java.lang.String the name of the lexicon file. - */ - public BlockLexiconInputStream(String filename) { - super(filename); - entrySize = BlockLexicon.lexiconEntryLength; - } - - public BlockLexiconInputStream(String path, String prefix) { - super(path, prefix); - entrySize = BlockLexicon.lexiconEntryLength; - } - /** - * A constructor given the filename. - * @param file java.io.File the name of the lexicon file. - */ - public BlockLexiconInputStream(File file) { - super(file); - entrySize = BlockLexicon.lexiconEntryLength; - } - - /** Read a lexicon from the specified input stream */ - public BlockLexiconInputStream(DataInput in) { - super(in); - entrySize = BlockLexicon.lexiconEntryLength; - } - /** - * Read the next lexicon entry. 
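
The nested Factory on BlockLexiconEntry above lets fixed-width lexicon files size and materialise entries generically: getSize() reports the on-disk width (the parent entry plus 4 bytes for blockCount) and newInstance() mints a blank entry for readFields(). A small sketch; the reader method is hypothetical:

import java.io.DataInput;
import java.io.IOException;
import uk.ac.gla.terrier.structures.BlockLexiconEntry;

final class ReadOneEntry {
    static BlockLexiconEntry read(DataInput in) throws IOException {
        BlockLexiconEntry.Factory factory = new BlockLexiconEntry.Factory();
        // each serialised entry occupies factory.getSize() bytes on disk
        BlockLexiconEntry e = (BlockLexiconEntry) factory.newInstance();
        e.readFields(in);
        return e;
    }
}
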
- * @return the number of bytes read if there is no error, - * otherwise returns -1 in case of EOF - * @throws java.io.IOException if an I/O error occurs - */ - public int readNextEntry() throws IOException { - try { - startBitOffset = (byte) (endBitOffset + 1); - startOffset = endOffset; - if (startBitOffset == 8) { - startOffset = endOffset + 1; - startBitOffset = 0; - } - lexiconStream.readFully( - termCharacters, - 0, - ApplicationSetup.STRING_BYTE_LENGTH); - - termId = lexiconStream.readInt(); - documentFrequency = lexiconStream.readInt(); - blockFrequency = lexiconStream.readInt(); - termFrequency = lexiconStream.readInt(); - endOffset = lexiconStream.readLong(); - endBitOffset = lexiconStream.readByte(); - numPointersRead += documentFrequency; - numTokensRead += termFrequency; - numTermsRead++; - return Lexicon.lexiconEntryLength; - } catch (EOFException eofe) { - return -1; - } - } - - /** - * Returns the number of entries in the lexicon file. - */ - public int numberOfEntries(){ - return (int)(lexiconFilelength / BlockLexicon.lexiconEntryLength); - } - - /** - * Prints out the contents of the lexicon file to check. - */ - public void print() { - int i = 0; //counter - int entryLength = Lexicon.lexiconEntryLength; - try { - while (readNextEntry() != -1) { - System.out.println( - "" - + (long)i * (long)entryLength - + ", " - + term.trim() - + ", " - + termId - + ", " - + documentFrequency - + ", " - + blockFrequency - + ", " - + termFrequency - + ", " - + endBitOffset); - i++; - } - } catch (IOException ioe) { - logger.error("Input/Output exception while reading the lexicon index input stream. ", ioe); - } - } - - /** - * Returns the block frequency for the currently processed term. - * @return int The block frequency for the currently processed term - */ - public int getBlockFrequency() { - return blockFrequency; - } -} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BlockLexiconOutputStream.java src/uk/ac/gla/terrier/structures/BlockLexiconOutputStream.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BlockLexiconOutputStream.java 2009-01-28 20:16:54.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/BlockLexiconOutputStream.java 1970-01-01 01:00:00.000000000 +0100 @@ -1,154 +0,0 @@ -/* - * Terrier - Terabyte Retriever - * Webpage: http://ir.dcs.gla.ac.uk/terrier - * Contact: terrier{a.}dcs.gla.ac.uk - * University of Glasgow - Department of Computing Science - * http://www.gla.ac.uk/ - * - * The contents of this file are subject to the Mozilla Public License - * Version 1.1 (the "License"); you may not use this file except in - * compliance with the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is BlockLexiconOutputStream.java. - * - * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. - * All Rights Reserved. 
- * - * Contributor(s): - * Douglas Johnson (original author) - * Vassilis Plachouras - */ -package uk.ac.gla.terrier.structures; -import java.io.DataOutput; -import java.io.File; -import java.io.IOException; - -import uk.ac.gla.terrier.utility.ApplicationSetup; -/** - * An output stream for writing the lexicon to a file sequentially. - * @author Douglas Johnson, Vassilis Plachouras - * @version $Revision: 1.17 $ - */ -public class BlockLexiconOutputStream extends LexiconOutputStream { - /** A zero buffer for writing to the file.*/ - private static byte[] zeroBuffer = - new byte[ApplicationSetup.STRING_BYTE_LENGTH]; - /** - * The number of different blocks in which a term appears. - * This is used only during the creation of the inverted - * file and it can be ignored afterwards. - */ - protected int blockFrequency; - /** - * A default constructor. - */ - public BlockLexiconOutputStream() { - super(); - } - /** - * A constructor given the filename. - * @param filename java.lang.String the name of the lexicon file. - */ - public BlockLexiconOutputStream(String filename) { - super(filename); - } - /** - * A constructor given the file. - * @param file java.io.File the lexicon file. - */ - public BlockLexiconOutputStream(File file) { - super(file); - } - - /** Create a lexicon using the specified data stream */ - public BlockLexiconOutputStream(DataOutput out){ - super(out); - } - /** - * Write a lexicon entry. - * @return the number of bytes written if there is no error, otherwise returns -1 in case of EOF - * @throws IOException if an I/O error occurs - * @param term the string representation of the term - * @param termId the terms integer identifier - * @param documentFrequency the term's document frequency in the collection - * @param termFrequency the term's frequency in the collection - * @param endOffset the term's ending byte offset in the inverted file - * @param endBitOffset the term's ending byte bit-offset in the inverted file - */ - public int writeNextEntry( - String term, - int termId, - int documentFrequency, - int termFrequency, - int blockFrequency, - long endOffset, - byte endBitOffset) - throws IOException { - byte[] tmpBytes = term.getBytes(); - final int length = tmpBytes.length; - numPointersWritten += documentFrequency; - numTokensWritten += termFrequency; - numTermsWritten++; - lexiconStream.write(tmpBytes, 0, length); - lexiconStream.write( - zeroBuffer, - 0, - ApplicationSetup.STRING_BYTE_LENGTH - length); - lexiconStream.writeInt(termId); - lexiconStream.writeInt(documentFrequency); - lexiconStream.writeInt(blockFrequency); - lexiconStream.writeInt(termFrequency); - lexiconStream.writeLong(endOffset); - lexiconStream.writeByte(endBitOffset); - return BlockLexicon.lexiconEntryLength; - } - /** - * Write a lexicon entry. 
- * @return the number of bytes written if there is no error, otherwise returns -1 in case of EOF - * @throws java.io.IOException if an I/O error occurs - * @param term the byte array representation of the term - * @param termId the terms integer identifier - * @param documentFrequency the term's document frequency in the collection - * @param termFrequency the term's frequency in the collection - * @param endOffset the term's ending byte offset in the inverted file - * @param endBitOffset the term's ending byte bit-offset in the inverted file - */ - public int writeNextEntry( - byte[] term, - int termId, - int documentFrequency, - int blockFrequency, - int termFrequency, - long endOffset, - byte endBitOffset) - throws IOException { - final int length = term.length; - numPointersWritten += documentFrequency; - numTokensWritten += termFrequency; - lexiconStream.write(term, 0, term.length); - lexiconStream.write( - zeroBuffer, - 0, - ApplicationSetup.STRING_BYTE_LENGTH - length); - lexiconStream.writeInt(termId); - lexiconStream.writeInt(documentFrequency); - lexiconStream.writeInt(blockFrequency); - lexiconStream.writeInt(termFrequency); - lexiconStream.writeLong(endOffset); - lexiconStream.writeByte(endBitOffset); - return BlockLexicon.lexiconEntryLength; - } - /** - * Sets the block frequency for the given term - * @param blockFrequency The new block frequency - */ - public void setBF(int blockFrequency) { - this.blockFrequency = blockFrequency; - } -} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BlockTermStatistics.java src/uk/ac/gla/terrier/structures/BlockTermStatistics.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/BlockTermStatistics.java 1970-01-01 01:00:00.000000000 +0100 +++ src/uk/ac/gla/terrier/structures/BlockTermStatistics.java 2009-03-03 14:34:49.000000000 +0000 @@ -0,0 +1,7 @@ +package uk.ac.gla.terrier.structures; + +public interface BlockTermStatistics extends TermStatistics { + /** The number of blocks that this term has. + * Needed by the BlockInvertedIndexBuilder. */ + public int getBlockCount(); +} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/ExpansionTerms.java src/uk/ac/gla/terrier/structures/ExpansionTerms.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/ExpansionTerms.java 2009-01-28 20:16:55.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/ExpansionTerms.java 2009-03-03 14:34:49.000000000 +0000 @@ -31,6 +31,8 @@ import gnu.trove.TIntHashSet; import gnu.trove.TIntObjectHashMap; +import java.util.Map; + import org.apache.log4j.Logger; import uk.ac.gla.terrier.matching.MatchingQueryTerms; @@ -55,7 +57,7 @@ /** The terms in the top-retrieval documents. */ protected TIntObjectHashMap terms; /** The lexicon used for retrieval. */ - protected Lexicon lexicon; + protected Lexicon lexicon; /** The number of documents in the collection. */ protected int numberOfDocuments; /** The number of tokens in the collection. */ @@ -167,7 +169,7 @@ * @param totalLength The sum of the length of the top-retrieved documents. * @param lexicon Lexicon The lexicon used for retrieval. 
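
The BlockTermStatistics interface above explains the instanceof guards in BlockLexiconEntry.add()/subtract(): TF and n_t always accumulate, but block counts do so only when the other side also carries them, as when merging the same term from two partial indices. An illustrative merge, not from the patch, assuming the parent add() accumulates TF and n_t as BasicTermStatsLexiconEntry does:

import uk.ac.gla.terrier.structures.BlockLexiconEntry;

final class MergeExample {
    public static void main(String[] args) {
        BlockLexiconEntry a = new BlockLexiconEntry(7, 3, 10, 2); // termid 7: n_t=3, TF=10, 2 blocks
        BlockLexiconEntry b = new BlockLexiconEntry(7, 1, 4, 1);  // same term from another run
        a.add(b); // b is a BlockTermStatistics, so block counts accumulate too
        System.out.println(a.getBlockCount()); // prints 3; TF is now 14, n_t is 4
    }
}
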
*/ - public ExpansionTerms(CollectionStatistics collStats, double totalLength, Lexicon lexicon) { + public ExpansionTerms(CollectionStatistics collStats, double totalLength, Lexicon lexicon) { this( collStats.getNumberOfDocuments(), collStats.getNumberOfTokens(), @@ -186,7 +188,7 @@ long numberOfTokens, double averageDocumentLength, double totalLength, - Lexicon lexicon) { + Lexicon lexicon) { this.numberOfDocuments = numberOfDocuments; this.numberOfTokens = numberOfTokens; this.averageDocumentLength = averageDocumentLength; @@ -255,10 +257,10 @@ } double TF = 0; - double Nt = 0; - lexicon.findTerm(allTerms[i].getTermID()); - TF = lexicon.getTF(); - Nt = lexicon.getNt(); + //double Nt = 0; + TermStatistics ts = lexicon.getLexiconEntry(allTerms[i].getTermID()).getValue(); + TF = ts.getFrequency(); + //Nt = ts.getDocumentFrequency(); allTerms[i].setWeightExpansion(QEModel.score( allTerms[i].getWithinDocumentFrequency(), TF @@ -283,9 +285,9 @@ logger.info("parameter free query expansion."); } } - lexicon.findTerm(allTerms[posMaxWeight].termID); if(logger.isDebugEnabled()){ - logger.debug("term with the maximum weight: " + lexicon.getTerm() + + String term = lexicon.getLexiconEntry(allTerms[posMaxWeight].termID).getKey(); + logger.debug("term with the maximum weight: " + term + ", normaliser: " + Rounding.toString(normaliser, 4)); } THashSet expandedTerms = new THashSet(); @@ -303,8 +305,8 @@ allTerms[i] = temp; } - lexicon.findTerm(allTerms[i].getTermID()); - final SingleTermQuery expandedTerm = new SingleTermQuery(lexicon.getTerm());//new TermTreeNode(lexicon.getTerm()); + String term = lexicon.getLexiconEntry(allTerms[i].getTermID()).getKey(); + final SingleTermQuery expandedTerm = new SingleTermQuery(term); expandedTerm.setWeight(allTerms[i].getWeightExpansion()/normaliser); @@ -323,11 +325,11 @@ if (weighedOriginalTermsCount==originalTerms.size()) break; - lexicon.findTerm(allTerms[i].getTermID()); - if (!originalTerms.contains(lexicon.getTerm())) + String term = lexicon.getLexiconEntry(allTerms[i].getTermID()).getKey(); + if (!originalTerms.contains(term)) continue; weighedOriginalTermsCount++; - final SingleTermQuery expandedTerm = new SingleTermQuery(lexicon.getTerm());//new TermTreeNode(lexicon.getTerm()); + final SingleTermQuery expandedTerm = new SingleTermQuery(term); expandedTerm.setWeight(allTerms[i].getWeightExpansion()/normaliser); //expandedTerms[i].normalisedFrequency = //terms[i].getWeightExpansion()/normaliser; @@ -350,7 +352,7 @@ this.originalTerms.clear(); for (int i=0; i lse = lexicon.getLexiconEntry(termId); + TF = lse.getValue().getFrequency(); + //Nt = lse.getValue().getDocumentFrequency(); + score = model.score(((ExpansionTerm)o).getWithinDocumentFrequency(), TF, this.totalDocumentLength, diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/FilePosition.java src/uk/ac/gla/terrier/structures/FilePosition.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/FilePosition.java 2009-01-28 20:16:55.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/FilePosition.java 2009-03-03 14:34:49.000000000 +0000 @@ -38,7 +38,7 @@ * a bit offset of 2. 
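
The ExpansionTerms hunks above replace the stateful findTerm()/getTF()/getNt() sequence with a single getLexiconEntry(termId) call that returns the term string and its statistics together. A sketch of the idiom; the generic parameters are assumptions, the diff having lost its angle brackets to formatting:

import java.util.Map;
import uk.ac.gla.terrier.structures.Lexicon;
import uk.ac.gla.terrier.structures.LexiconEntry;

final class TermStatsLookup {
    static void print(Lexicon<String> lexicon, int termId) {
        Map.Entry<String,LexiconEntry> entry = lexicon.getLexiconEntry(termId);
        if (entry == null)
            return;
        System.out.println(entry.getKey()                        // the term string
            + " TF=" + entry.getValue().getFrequency()           // collection frequency
            + " Nt=" + entry.getValue().getDocumentFrequency()); // document frequency
    }
}
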
* @author Craig Macdonald, Vassilis Plachouras & John Kane */ -public class FilePosition +public class FilePosition implements BitFilePosition { /** The number of bytes a file position could be converted to * - 8 for the byte's long, 1 for the bits @@ -83,6 +83,15 @@ Bits = in.Bits; } + public long getBytes() { return Bytes; } + public byte getBits() { return Bits; } + + public void setPosition(long bytes, byte bits) + { + Bytes = bytes; + Bits = bits; + } + /** How large is this object when serialized */ public static int sizeInBytes() { diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/Index.java src/uk/ac/gla/terrier/structures/Index.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/Index.java 2009-01-28 20:16:55.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/Index.java 2009-03-03 14:34:49.000000000 +0000 @@ -377,7 +377,9 @@ String structureClassName = properties.getProperty("index."+structureName+".class"); if (structureClassName == null) { - logger.error("This index ("+this.toString()+") doesnt have an index structure called "+ structureName); + logger.error("This index ("+this.toString()+") doesnt have an index structure called "+ structureName + + ": property index."+structureName+".class not found"); + logger.error(properties.toString()); return null;//TODO exceptions? } //obtain the class definition for the index structure @@ -385,7 +387,8 @@ try{ indexStructureClass = Class.forName(structureClassName, false, this.getClass().getClassLoader()); } catch (ClassNotFoundException cnfe) { - logger.error("This index ("+this.toString()+") references an unknown index structure class: "+structureName+ " looking for "+ structureClassName); + logger.error("ClassNotFoundException: This index ("+this.toString()+") references an unknown index structure class: "+structureName+ " looking for "+ structureClassName); + cnfe.printStackTrace(); return null;//TODO exceptions? 
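
Returning to the FilePosition change above: implementing BitFilePosition means any code that records or restores a byte-and-bit offset can work against the interface alone. A tiny sketch; the two-argument constructor is an assumption, as this hunk does not show it:

import uk.ac.gla.terrier.structures.BitFilePosition;
import uk.ac.gla.terrier.structures.FilePosition;

final class PositionExample {
    public static void main(String[] args) {
        BitFilePosition pos = new FilePosition(0L, (byte) 0);
        pos.setPosition(1024L, (byte) 3);  // byte 1024, bit 3 within that byte
        System.out.println(pos.getBytes() + ":" + pos.getBits()); // 1024:3
    }
}
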
} @@ -416,8 +419,20 @@ objs[i] = prefix; else if (p.equals("index")) objs[i] = this; + else if (p.equals("structureName")) + { + final String tmp = structureName; + objs[i] = tmp.replaceAll("-inputstream$", ""); + } else if (p.endsWith("-inputstream"))//no caching for input streams objs[i] = loadIndexStructure(p); + else if (p.matches("^\\$\\{.+\\}$")) + { + String propertyName = p.substring(2,p.length()-1); + objs[i] = properties.getProperty(propertyName, ApplicationSetup.getProperty("max.term.length", ""+20)); + if (objs[i] == null) + throw new IllegalArgumentException("Property "+propertyName+" not found"); + } else objs[i] = getIndexStructure(p); i++; @@ -492,7 +507,7 @@ } try{ final OutputStream outputStream = Files.writeFileStream(propertiesFilename); - properties.store(outputStream,""); + properties.store(outputStream,this.toString()); outputStream.close(); } catch (IOException ioe) { logger.warn("Could not write to index properties at "+propertiesFilename + " - some changes may be lost", ioe); @@ -517,9 +532,10 @@ return (DirectIndex)getIndexStructure("direct"); } /** Return the Lexicon associated with this index */ - public Lexicon getLexicon() + @SuppressWarnings("unchecked") + public Lexicon getLexicon() { - return (Lexicon)getIndexStructure("lexicon"); + return (Lexicon)getIndexStructure("lexicon"); } /** Return the DocumentIndex associated with this index */ public DocumentIndex getDocumentIndex() diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/IndexUtil.java src/uk/ac/gla/terrier/structures/IndexUtil.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/IndexUtil.java 1970-01-01 01:00:00.000000000 +0100 +++ src/uk/ac/gla/terrier/structures/IndexUtil.java 2009-03-03 14:34:49.000000000 +0000 @@ -0,0 +1,39 @@ +package uk.ac.gla.terrier.structures; + +import java.io.IOException; +import uk.ac.gla.terrier.utility.Files; + +public class IndexUtil { + + /** Move an index from one location to another */ + public static void renameIndex(String srcPath, String srcPrefix, String dstPath, String dstPrefix) + throws IOException + { + final String actualPrefix = srcPrefix +'.'; + for (String filename : Files.list(srcPath)) + { + //System.err.println("Checking "+filename); + if (filename.startsWith(actualPrefix)) + { + final String newFilename = filename.replaceFirst(srcPrefix, dstPrefix); + Files.rename(srcPath + "/" + filename, dstPath+"/"+ newFilename); + } + } + } + + /** Delete an existing index */ + public static void deleteIndex(String path, String prefix) + throws IOException + { + final String actualPrefix = prefix +'.'; + for (String filename : Files.list(path)) + { + if (filename.startsWith(actualPrefix)) + { + + Files.delete(path + "/" + filename); + } + } + } + +} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/InvertedIndex.java src/uk/ac/gla/terrier/structures/InvertedIndex.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/InvertedIndex.java 2009-01-28 20:16:55.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/InvertedIndex.java 2009-03-03 14:34:49.000000000 +0000 @@ -27,14 +27,13 @@ */ package uk.ac.gla.terrier.structures; import java.io.IOException; -import java.util.ArrayList; + import org.apache.log4j.Logger; -import uk.ac.gla.terrier.compression.BitFile; +import uk.ac.gla.terrier.compression.BitFileBuffered; import 
uk.ac.gla.terrier.compression.BitIn; import uk.ac.gla.terrier.compression.BitInSeekable; import uk.ac.gla.terrier.compression.OldBitFile; -import uk.ac.gla.terrier.utility.ApplicationSetup; import uk.ac.gla.terrier.utility.FieldScore; /** * This class implements the inverted index @@ -67,43 +66,29 @@ /** Filename of the open file */ protected String filename; - /** - * The lexicon used for retrieving documents. - */ - protected Lexicon lexicon; - /** A constructor for child classes that doesnt open the file */ protected InvertedIndex(long a, long b, long c) { } + + /** A default constructor, only for use by child classes */ protected InvertedIndex() { } - - public InvertedIndex(Lexicon lexicon, String path, String prefix) + + public InvertedIndex(Index index, String structureName) { - this(lexicon, path + ApplicationSetup.FILE_SEPARATOR + prefix + ApplicationSetup.IFSUFFIX); + this(index.getPath() + "/" + index.getPrefix() + "." + structureName + ".bf"); } /** - * Creates an instance of the HtmlInvertedIndex class using the lexicon. - * @param lexicon The lexicon used for retrieval - */ - public InvertedIndex(Lexicon lexicon) { - this(lexicon, ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX); - //file = new BitFile(ApplicationSetup.INVERTED_FILENAME, "r"); - //this.lexicon = lexicon; - } - /** * Creates an instance of the HtmlInvertedIndex class using the given * lexicon. - * @param lexicon The lexicon used for retrieval * @param filename The name of the inverted file */ - public InvertedIndex(Lexicon lexicon, String filename) { - file = new BitFile(this.filename = filename, "r"); - this.lexicon = lexicon; + public InvertedIndex(String filename) { + file = new BitFileBuffered(this.filename = filename); } /** forces the data structure to reopen the underlying bitfile * using the legacy implementation of BitFile (OldBitFile) @@ -115,350 +100,50 @@ file = new OldBitFile(filename, "r"); } - /** - * Prints out the inverted index file. - */ - public void print() { - for (int i = 0; i < lexicon.getNumberOfLexiconEntries(); i++) { - int[][] documents = getDocuments(i); - System.out.print("tid"+i); - if (useFieldInformation) { - for (int j = 0; j < documents[0].length; j++) { - System.out.print("(" + documents[0][j] + ", " + documents[1][j] - + ", F" + documents[2][j] + ") "); - } - System.out.println(); - } else { - for (int j = 0; j < documents[0].length; j++) { - System.out.print("(" + documents[0][j] + ", " - + documents[1][j] + ") "); - } - System.out.println(); - } - } - } - - public int[][] getDocuments(LexiconEntry lEntry) { - if (lEntry==null) - return null; - return getDocuments(lEntry.startOffset, - lEntry.startBitOffset, - lEntry.endOffset, - lEntry.endBitOffset, lEntry.n_t); - } - /** - * Returns a two dimensional array containing the document ids, term - * frequencies and field scores for the given documents. - * @return int[][] the two dimensional [3][n] array containing the n - * document identifiers, frequencies and field scores. If fields is not enabled, then size is [2][n]. - * @param termid the identifier of the term whose documents we are looking for. 
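
The new constructors above decouple InvertedIndex from the Lexicon: the bit file name is now derived from the owning index. A sketch of the equivalence, with illustrative paths:

import uk.ac.gla.terrier.structures.Index;
import uk.ac.gla.terrier.structures.InvertedIndex;

final class OpenInverted {
    static InvertedIndex open(Index index) {
        // for an index at /idx with prefix "data", this is equivalent to
        // new InvertedIndex("/idx/data.inverted.bf")
        return new InvertedIndex(index, "inverted");
    }
}
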
- */ - public int[][] getDocuments(int termid) { - LexiconEntry lEntry = lexicon.getLexiconEntry(termid); - if (lEntry == null) - return null; - return getDocuments(lEntry.startOffset, - lEntry.startBitOffset, - lEntry.endOffset, - lEntry.endBitOffset, lEntry.n_t); + public void print() + { + //TODO + throw new UnsupportedOperationException("InvIndex.print() is missing"); } -/** - * Returns a two dimensional array containing the document ids, term - * frequencies and field scores for the given documents. - * @return int[][] the two dimensional [3][n] array containing the n - * document identifiers, frequencies and field scores. If fields is not enabled, then size is [2][n]. - * @param sOffset start byte of the postings in the inverted file - * @param sBitOffset start bit of the postings in the inverted file - * @param eOffset end byte of the postings in the inverted file - * @param eBitOffset end bit of the postings in the inverted file - */ - public int[][] getDocuments(long sOffset, byte sBitOffset, long eOffset, byte eBitOffset, int df) { - - final byte startBitOffset = sBitOffset; - final long startOffset = sOffset; - final byte endBitOffset = eBitOffset; - final long endOffset = eOffset; + public int[][] getDocuments(BitIndexPointer pointer) { + if (pointer==null) + return null; final int fieldCount = FieldScore.FIELDS_COUNT; final boolean loadTagInformation = FieldScore.USE_FIELD_INFORMATION; - //int df = lexicon.getNt(); + final int count = pointer.getNumberOfEntries(); try{ + final BitIn file = this.file.readReset(pointer.getBytes(), pointer.getBits()); int[][] documentTerms = null; - final BitIn file = this.file.readReset(startOffset, startBitOffset, endOffset, endBitOffset); if (loadTagInformation) { //if there are tag information to process - documentTerms = new int[3][df]; + documentTerms = new int[3][count]; documentTerms[0][0] = file.readGamma() - 1; documentTerms[1][0] = file.readUnary(); documentTerms[2][0] = file.readBinary(fieldCount); - for (int i = 1; i < df; i++) { + for (int i = 1; i < count; i++) { documentTerms[0][i] = file.readGamma() + documentTerms[0][i - 1]; documentTerms[1][i] = file.readUnary(); documentTerms[2][i] = file.readBinary(fieldCount); } } else { //no tag information to process - documentTerms = new int[2][df]; + documentTerms = new int[2][count]; //new documentTerms[0][0] = file.readGamma() - 1; documentTerms[1][0] = file.readUnary(); - for(int i = 1; i < df; i++){ + for(int i = 1; i < count; i++){ documentTerms[0][i] = file.readGamma() + documentTerms[0][i - 1]; documentTerms[1][i] = file.readUnary(); } } + file.close(); return documentTerms; } catch (IOException ioe) { logger.error("Problem reading inverted index", ioe); return null; } - } - - -// public int[][] getDocuments(long sOffset, byte sBitOffset, long eOffset, byte eBitOffset) { -// -// final byte startBitOffset = sBitOffset; -// final long startOffset = sOffset; -// final byte endBitOffset = eBitOffset; -// final long endOffset = eOffset; -// final int fieldCount = FieldScore.FIELDS_COUNT; -// final boolean loadTagInformation = FieldScore.USE_FIELD_INFORMATION; -// -// /* Coding is done separately for with Fields and without Fields, to keep -// * if's out of loops. 
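
As with the block variant, getDocuments() here needs only the pointer; the shape of the result depends on whether field information is enabled, [3][n] with field bitsets or [2][n] without. A consuming sketch; the helper class is hypothetical:

import uk.ac.gla.terrier.structures.BitIndexPointer;
import uk.ac.gla.terrier.structures.InvertedIndex;

final class DumpPostings {
    static void dump(InvertedIndex inv, BitIndexPointer pointer) {
        int[][] postings = inv.getDocuments(pointer); // null on error or null pointer
        if (postings == null)
            return;
        for (int j = 0; j < postings[0].length; j++)
            System.out.println("docid=" + postings[0][j] + " tf=" + postings[1][j]
                + (postings.length > 2 ? " fields=" + postings[2][j] : ""));
    }
}
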
*/ -// -// ArrayList temporaryTerms = null; //instantiate when we know roughly how big it should be -// int[][] documentTerms = null; -// file.readReset(startOffset, startBitOffset, endOffset, endBitOffset); -// //boolean hasMore = false; -// if (loadTagInformation) { //if there are tag information to process -// /* FIELD_LOAD_FACTOR provides a heuristical rough size need for the arraylist. */ -// /* could probably do a better optimisation by considering the number of fields.*/ -// //temporaryTerms = new ArrayList((int)((endOffset-startOffset)*FIELD_LOAD_FACTOR)); -// TIntArrayList temporaryDocids = new TIntArrayList((int)((endOffset-startOffset)*NORMAL_LOAD_FACTOR)); -// TIntArrayList temporaryTFs = new TIntArrayList((int)((endOffset-startOffset)*NORMAL_LOAD_FACTOR)); -// TIntArrayList temporaryFields = new TIntArrayList((int)((endOffset-startOffset)*NORMAL_LOAD_FACTOR)); -// int previousDocid = -1; -// -// while (((file.getByteOffset() + startOffset) < endOffset) -// || (((file.getByteOffset() + startOffset) == endOffset) && (file -// .getBitOffset() < endBitOffset))) { -// //read document ID -// temporaryDocids.add(previousDocid = file.readGamma() + previousDocid); -// //read document frequency -// temporaryTFs.add(file.readUnary()); -// //read fields bitset (fieldCount bits long) -// temporaryFields.add(file.readBinary(fieldCount)); -// -// /*int[] tmp = new int[3]; -// //read documnent ID -// tmp[0] = file.readGamma(); -// //read document frequency -// tmp[1] = file.readUnary(); -// //read fields bitset (fieldCount bits long) -// tmp[2] = file.readBinary(fieldCount); -// temporaryTerms.add(tmp);*/ -// } -// final int postingsListSize = temporaryDocids.size(); -// documentTerms = new int[3][postingsListSize]; -// temporaryDocids.toNativeArray(documentTerms[0], 0, postingsListSize); -// temporaryTFs.toNativeArray(documentTerms[1], 0, postingsListSize); -// temporaryFields.toNativeArray(documentTerms[2], 0, postingsListSize); -// /* -// documentTerms = new int[3][temporaryTerms.size()]; -// int[] tmpDocumentTerms0 = documentTerms[0]; -// int[] tmpDocumentTerms1 = documentTerms[1]; -// int[] tmpDocumentTerms2 = documentTerms[2]; -// tmpDocumentTerms0[0] = ((int[]) temporaryTerms.get(0))[0] - 1; -// tmpDocumentTerms1[0] = ((int[]) temporaryTerms.get(0))[1]; -// tmpDocumentTerms2[0] = ((int[]) temporaryTerms.get(0))[2]; -// if (documentTerms[0].length > 1) { -// for (int i = 1; i < documentTerms[0].length; i++) { -// int[] tmpMatrix = (int[]) temporaryTerms.get(i); -// tmpDocumentTerms0[i] = tmpMatrix[0] + documentTerms[0][i - 1]; -// tmpDocumentTerms1[i] = tmpMatrix[1]; -// tmpDocumentTerms2[i] = tmpMatrix[2]; -// } -// } -// */ -// } else { //no tag information to process -// -// /* NORMAL_LOAD_FACTOR provides a heuristical rough size need for the arraylist */ -// TIntArrayList temporaryDocids = new TIntArrayList((int)((endOffset-startOffset)*NORMAL_LOAD_FACTOR)); -// TIntArrayList temporaryTFs = new TIntArrayList((int)((endOffset-startOffset)*NORMAL_LOAD_FACTOR)); -// //temporaryTerms = new ArrayList((int)((endOffset-startOffset)*NORMAL_LOAD_FACTOR)); -// -// int previousDocid = -1; -// while (((file.getByteOffset() + startOffset) < endOffset) -// || (((file.getByteOffset() + startOffset) == endOffset) && (file -// .getBitOffset() < endBitOffset))) { -// //read document ID -// temporaryDocids.add(previousDocid = file.readGamma() + previousDocid); -// //read document frequency -// temporaryTFs.add(file.readUnary()); -// //int[] tmp = new int[2]; -// //read document ID -// //tmp[0] = 
file.readGamma(); -// //read document frequency -// //tmp[1] = file.readUnary(); -// //temporaryTerms.add(tmp); -// } -// -// final int postingsListSize = temporaryDocids.size(); /*temporaryTerms.size()*/ -// documentTerms = new int[2][postingsListSize]; -// temporaryDocids.toNativeArray(documentTerms[0], 0, postingsListSize); -// temporaryTFs.toNativeArray(documentTerms[1], 0, postingsListSize); -// //int last = -1; -// //int[] tmpDocumentTerms0 = documentTerms[0]; -// //for(int i=0;i 1) { -// // for (int i = 1; i < documentTerms[0].length; i++) { -// // last = tmpDocumentTerms0[i] = temporaryDocids.get(i) + last; -// // tmpDocumentTerms1[i] = temporaryTFs.get(i); -// //int[] tmpMatrix = (int[]) temporaryTerms.get(i); -// //tmpDocumentTerms0[i] = tmpMatrix[0] + documentTerms[0][i - 1]; -// //tmpDocumentTerms1[i] = tmpMatrix[1]; -// // } -// //} -// } -// //System.out.println((endOffset-startOffset)+" , "+temporaryTerms.size()); -// return documentTerms; -// } - /* * - * Returns a five dimensional array containing the document ids, - * the term frequencies, the field scores the block frequencies and - * the block ids for the given documents. The returned postings are - * for the documents within a specified range of docids. - * @return int[][] the five dimensional [5][] array containing - * the document ids, frequencies, field scores and block - * frequencies, while the last vector contains the - * block identifiers and it has a different length from - * the document identifiers. - * @param termid the id of the term whose documents we are looking for. - * @param startDocid The starting docid that will be returned. - * @param endDocid The last possible docid that will be returned. - */ - /*public int[][] getDocuments(int termid, int startDocid, int endDocid) { - // Coding is done separately for with Fields and without Fields, to keep - if's out of loops. - boolean found = lexicon.findTerm(termid); - if (!found) - return null; - - byte startBitOffset = lexicon.getStartBitOffset(); - long startOffset = lexicon.getStartOffset(); - byte endBitOffset = lexicon.getEndBitOffset(); - long endOffset = lexicon.getEndOffset(); - final int fieldCount = FieldScore.FIELDS_COUNT; - final boolean loadTagInformation = FieldScore.USE_FIELD_INFORMATION; - ArrayList temporaryTerms = null; //instantiate when we know roughly how big it should be - int[][] documentTerms = null; - try{ - final BitIn file = this.file.readReset(startOffset, startBitOffset, endOffset, endBitOffset); - //boolean hasMore = false; - if (loadTagInformation) { //if there are tag information to process - // FIELD_LOAD_FACTOR provides a heuristical rough size need for the arraylist. - // could probably do a better optimisation by considering the number of fields. 
- temporaryTerms = new ArrayList((int)((endOffset-startOffset)*FIELD_LOAD_FACTOR)); - while (((file.getByteOffset() + startOffset) < endOffset) - || (((file.getByteOffset() + startOffset) == endOffset) && (file - .getBitOffset() < endBitOffset))) { - int[] tmp = new int[3]; - //read documnent ID - tmp[0] = file.readGamma(); - //read document frequency - tmp[1] = file.readUnary(); - //read fields bitset (fieldCount bits long) - tmp[2] = file.readBinary(fieldCount); - if (tmp[0]>=startDocid && tmp[0]<=endDocid) - temporaryTerms.add(tmp); - } - documentTerms = new int[3][temporaryTerms.size()]; - int[] tmpDocumentTerms0 = documentTerms[0]; - int[] tmpDocumentTerms1 = documentTerms[1]; - int[] tmpDocumentTerms2 = documentTerms[2]; - tmpDocumentTerms0[0] = ((int[]) temporaryTerms.get(0))[0] - 1; - tmpDocumentTerms1[0] = ((int[]) temporaryTerms.get(0))[1]; - tmpDocumentTerms2[0] = ((int[]) temporaryTerms.get(0))[2]; - if (documentTerms[0].length > 1) { - for (int i = 1; i < documentTerms[0].length; i++) { - int[] tmpMatrix = (int[]) temporaryTerms.get(i); - tmpDocumentTerms0[i] = tmpMatrix[0] + documentTerms[0][i - 1]; - tmpDocumentTerms1[i] = tmpMatrix[1]; - tmpDocumentTerms2[i] = tmpMatrix[2]; - } - } - } else { //no tag information to process - - //NORMAL_LOAD_FACTOR provides a heuristical rough size need for the arraylist - temporaryTerms = new ArrayList((int)((endOffset-startOffset)*NORMAL_LOAD_FACTOR)); - while (((file.getByteOffset() + startOffset) < endOffset) - || (((file.getByteOffset() + startOffset) == endOffset) && (file - .getBitOffset() < endBitOffset))) { - int[] tmp = new int[2]; - //read document ID - tmp[0] = file.readGamma(); - //read document frequency - tmp[1] = file.readUnary(); - temporaryTerms.add(tmp); - } - documentTerms = new int[2][temporaryTerms.size()]; - int[] tmpDocumentTerms0 = documentTerms[0]; - int[] tmpDocumentTerms1 = documentTerms[1]; - tmpDocumentTerms0[0] = ((int[]) temporaryTerms.get(0))[0] - 1; - tmpDocumentTerms1[0] = ((int[]) temporaryTerms.get(0))[1]; - if (documentTerms[0].length > 1) { - for (int i = 1; i < documentTerms[0].length; i++) { - int[] tmpMatrix = (int[]) temporaryTerms.get(i); - tmpDocumentTerms0[i] = tmpMatrix[0] + documentTerms[0][i - 1]; - tmpDocumentTerms1[i] = tmpMatrix[1]; - } - } - } - } - catch (IOException ioe) { - logger.error("Problem reading inverted index", ioe); - return null; - } - - return documentTerms; - }*/ - - /** - * Returns the information for a posting list in string format - */ - public String getInfo(int term) { - StringBuilder info = new StringBuilder(); - int[][] documents = getDocuments(term); - if (useFieldInformation) { - for (int j = 0; j < documents[0].length; j++) { - info.append("("); - info.append(documents[0][j]); - info.append(","); - info.append(documents[1][j]); - info.append(","); - info.append(documents[2][j]); - info.append(")"); - } - } else { - for (int j = 0; j < documents[0].length; j++) { - info.append("("); - info.append(documents[0][j]); - info.append(","); - info.append(documents[1][j]); - info.append(")"); - } - } - return info.toString(); - } + } /** diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/InvertedIndexInputStream.java src/uk/ac/gla/terrier/structures/InvertedIndexInputStream.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/InvertedIndexInputStream.java 2009-01-28 20:16:55.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/InvertedIndexInputStream.java 
2009-03-03 14:34:49.000000000 +0000 @@ -27,11 +27,12 @@ package uk.ac.gla.terrier.structures; import java.io.IOException; +import java.util.Iterator; +import java.util.Map; import uk.ac.gla.terrier.compression.BitIn; import uk.ac.gla.terrier.compression.BitInputStream; import uk.ac.gla.terrier.compression.OldBitInputStream; -import uk.ac.gla.terrier.utility.ApplicationSetup; import uk.ac.gla.terrier.utility.FieldScore; @@ -43,7 +44,7 @@ public class InvertedIndexInputStream implements Closeable,LegacyBitFileStructure { /** the lexicon input stream providing the offsets */ - protected final LexiconInputStream lis; + protected final Iterator<Map.Entry<String,LexiconEntry>> lis; /** The gamma compressed file containing the terms. */ protected BitIn file; /** filename of the underlying bitfile */ @@ -52,21 +53,10 @@ /** Indicates whether field information is used.*/ final boolean useFieldInformation = FieldScore.USE_FIELD_INFORMATION; - public InvertedIndexInputStream(String path, String prefix, LexiconInputStream lis) throws IOException + public InvertedIndexInputStream(Index _index, String structureName, Iterator<Map.Entry<String,LexiconEntry>> positions) throws IOException { - this(path + ApplicationSetup.FILE_SEPARATOR + prefix + ApplicationSetup.IFSUFFIX, lis); - } - - public InvertedIndexInputStream(String filename, LexiconInputStream lis) throws IOException - { - file = new BitInputStream(this.filename = filename); - this.lis = lis; - } - - public InvertedIndexInputStream(BitIn invFile, LexiconInputStream lis) throws IOException - { - file = invFile; - this.lis = lis; + file = new BitInputStream(_index.getPath() + "/" + _index.getPrefix() +"."+ structureName +".bf"); + this.lis = positions; } /** forces the data structure to reopen the underlying bitfile @@ -80,15 +70,17 @@ } public int[][] getNextDocuments() throws IOException { - int rtrLis = lis.readNextEntry(); - if (rtrLis < 0) + if (! 
lis.hasNext()) return null; - return getNextDocuments(lis.getNt(), lis.getEndOffset(), lis.getEndBitOffset()); + + return getNextDocuments(lis.next().getValue()); } - protected int[][] getNextDocuments(int df, long endByteOffset, byte endBitOffset) throws IOException { + protected int[][] getNextDocuments(BitIndexPointer pointer) throws IOException { int[][] documentTerms = null; final int fieldCount = FieldScore.FIELDS_COUNT; + System.out.println("term"+ ((TermStatistics)pointer).getTermId() + " has Nt="+pointer.getNumberOfEntries() ); + final int df = pointer.getNumberOfEntries(); if (useFieldInformation) { //if there are tag information to process documentTerms = new int[3][df]; documentTerms[0][0] = file.readGamma() - 1; @@ -118,7 +110,7 @@ try{ while((documents = getNextDocuments()) != null) { - System.out.print("tid"+i); + System.out.print((i++)+"th term: "); if (useFieldInformation) { for (int j = 0; j < documents[0].length; j++) { System.out.print("(" + documents[0][j] + ", " + documents[1][j] @@ -140,6 +132,7 @@ public void close() { try{ file.close(); } catch (IOException ioe) {} - lis.close(); + if (lis instanceof Closeable) + ((Closeable)lis).close(); } } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/JDBMHashtable.java src/uk/ac/gla/terrier/structures/JDBMHashtable.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/JDBMHashtable.java 2007-06-08 15:29:54.000000000 +0100 +++ src/uk/ac/gla/terrier/structures/JDBMHashtable.java 1970-01-01 01:00:00.000000000 +0100 @@ -1,129 +0,0 @@ -package uk.ac.gla.terrier.structures; -import java.util.Enumeration; -import java.io.IOException; - -import org.apache.log4j.Logger; - -import jdbm.RecordManager; -import jdbm.RecordManagerFactory; -import jdbm.helper.FastIterator; -import jdbm.htree.HTree; -public class JDBMHashtable implements PersistentHashtable -{ - /** The logger used */ - private static Logger logger = Logger.getRootLogger(); - private RecordManager database; - private HTree table; - private static final String TABLENAME = "JDBMHashtable"; - private static final boolean AutoCommit = false; - protected String filename; - public JDBMHashtable(String Filename) - { - filename = Filename; - try{ - database = RecordManagerFactory.createRecordManager(filename); - long recid = database.getNamedObject(TABLENAME); - if (recid==0) - { - table = HTree.createInstance(database); - database.setNamedObject(TABLENAME, table.getRecid()); - } - else - { - table = HTree.load(database, recid); - } - }catch (IOException ioe){ - logger.fatal("Couldn't open a JDBM : ",ioe); - } - } - public void clear() - { - //try{ - //TODO - //}catch (IOException ioe){ - // System.err.println("Couldn't clear JDBM : "+ioe); - //} - } - public boolean containsKey(String key) - { - try{ - return table.get(key) != null; - }catch(IOException ioe){ - return false; - } - } - public boolean equals(Object o) - { - if (! 
(o instanceof JDBMHashtable)) - return false; - return ((JDBMHashtable)o).filename.equals(filename); - } - public String get(String key) - { - try{ - return (String)table.get(key); - }catch(IOException ioe){ - logger.error("JDBM problem : ",ioe); - return null; - } - } - public boolean isEmpty() - { - return size() == 0; - } - public void put(String key, String value) - { - try{ - table.put(key, value); - if (AutoCommit) database.commit(); - }catch (IOException ioe){ - logger.error("JDBM problem : ",ioe); - } - } - public void remove(String key) - { - try{ - table.remove(key); - if (AutoCommit) database.commit(); - }catch (IOException ioe){ - logger.error("JDBM problem : ",ioe); - } - } - public int size() - { - int count =0; - try{ - FastIterator keys = table.keys(); - while(keys.next() != null) - { - count ++; - } - }catch(Exception e){} - return count; - } - public Enumeration keys() - { - return null;//return database.keys(); - } - public Enumeration values() - { - return null;//return database.elements(); - } - public void close() - { - try{ - database.commit(); - database.close(); - }catch (IOException ioe){ - logger.error("Failed to commit changes to JDBM : ",ioe); - } - } - - public void commit() { - try { - database.commit(); - } catch (IOException ioe){ - logger.error("Failed to commit changes to JDBM : ",ioe); - } - } -} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/Lexicon.java src/uk/ac/gla/terrier/structures/Lexicon.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/Lexicon.java 2009-01-28 20:16:55.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/Lexicon.java 2009-03-03 14:34:49.000000000 +0000 @@ -1,654 +1,61 @@ -/* - * Terrier - Terabyte Retriever - * Webpage: http://ir.dcs.gla.ac.uk/terrier - * Contact: terrier{a.}dcs.gla.ac.uk - * University of Glasgow - Department of Computing Science - * http://www.gla.ac.uk/ - * - * The contents of this file are subject to the Mozilla Public License - * Version 1.1 (the "License"); you may not use this file except in - * compliance with the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is Lexicon.java. - * - * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. - * All Rights Reserved. - * - * Contributor(s): - * Gianni Amati (original author) - * Vassilis Plachouras - */ package uk.ac.gla.terrier.structures; -import gnu.trove.TIntObjectHashMap; -import java.io.ByteArrayInputStream; -import java.io.DataInputStream; -import java.io.File; -import java.io.IOException; -import java.io.ObjectInputStream; -import java.util.Iterator; - -import uk.ac.gla.terrier.utility.ApplicationSetup; -import uk.ac.gla.terrier.utility.Files; -import uk.ac.gla.terrier.utility.io.RandomDataInput; -import uk.ac.gla.terrier.utility.io.RandomDataOutput; - -import org.apache.log4j.Logger; -/** - * The class that implements the lexicon structure. Apart from the lexicon file, - * which contains the actual data about the terms, and takes its name from - * ApplicationSetup.LEXICON_FILENAME, another file is created and - * used, containing a mapping from the term's code to the offset of the term - * in the lexicon. 
The name of this file is given by - * ApplicationSetup.LEXICON_INDEX_FILENAME. - * - * @see ApplicationSetup#LEXICON_FILENAME - * @see ApplicationSetup#LEXICON_INDEX_FILENAME - * @author Gianni Amati, Vassilis Plachouras - * @version $Revision: 1.47 $ - */ -public class Lexicon implements Iterable, Closeable{ - /** The logger used for the Lexicon */ - protected Logger logger = Logger.getRootLogger(); - - /** The term represented as an array of bytes.*/ - protected byte[] termCharacters; - - /** The term represented as a string.*/ - protected String term; - - /** An integer representing the id of the term.*/ - protected int termId; - - /** The document frequency of the term.*/ - protected int documentFrequency; - - /** The term frequency of the term.*/ - protected int termFrequency; - - /** The offset in bytes in the inverted file of the term.*/ - protected long startOffset; - - /** The offset in bits in the starting byte in the inverted file.*/ - protected byte startBitOffset; - - /** The offset in bytes in the inverted file of the term.*/ - protected long endOffset; - - /** The offset in bits in the ending byte in the inverted file.*/ - protected byte endBitOffset; - - /** - * The size in bytes of an entry in the lexicon file. - * An entry corresponds to a string, an int (termCode), - * an int (docf), an int (tf), a long (the offset of the end - * of the term's entry in bytes in the inverted file) and - * a byte (the offset in bits of the last byte of the term's entry - * in the inverted file. - */ - public static final int lexiconEntryLength = - ApplicationSetup.STRING_BYTE_LENGTH //the string representation - +12 //the three integers - +8 //the long - +1; //the byte - - /** The file containing the mapping from the codes to the offset in the lexicon file.*/ - protected RandomDataInput idToOffsetFile; - - /** The actual lexicon file.*/ - protected RandomDataInput lexiconFile; - - /** Filename of the of lexicon file opened */ - protected String lexiconFileName; - - /** The number of entries in the lexicon file.*/ - protected int numberOfLexiconEntries; - - /** A buffer for reading from the lexicon file.*/ - protected byte[] buffer = new byte[512]; - - /** A second buffer for finding terms.*/ - protected byte[] bt = new byte[ApplicationSetup.STRING_BYTE_LENGTH]; - - /** A byte input stream to read from the buffer.*/ - protected ByteArrayInputStream bufferInput = new ByteArrayInputStream(buffer); - - /** A data input stream to read from the bufferInput.*/ - protected DataInputStream dataInput = new DataInputStream(bufferInput); - - /** - * A hashmap that is used in order to reduce the number - * of random accesses on disk during the binary search - */ - protected TIntObjectHashMap map = null; - - /** Controls whether to use the hash for speeding up - * lexicon entry lookups or not. The corresponding - * property is lexicon.use.hash. - */ - protected boolean USE_HASH = Boolean.parseBoolean(ApplicationSetup.getProperty("lexicon.use.hash","true")); - - protected Class inputStreamClass = LexiconInputStream.class; - - /** Contructor for child classes which dont want to open a file */ - protected Lexicon(long a, long b, long c) {} - - /** - * A default constructor. - */ - public Lexicon() { - this(ApplicationSetup.LEXICON_FILENAME); - } - - public Lexicon(String path, String prefix) - { - this(path + ApplicationSetup.FILE_SEPARATOR + prefix + ApplicationSetup.LEXICONSUFFIX); - } - - /** - * Constructs an instace of Lexicon and opens - * the corresponding file. 
- * - * @param lexiconName the name of the lexicon file. - */ - public Lexicon(String lexiconName) { - boolean updateable = false; - try { - lexiconFile = updateable - ? Files.writeFileRandom(this.lexiconFileName = lexiconName) - : Files.openFileRandom(this.lexiconFileName = lexiconName); - idToOffsetFile = Files.openFileRandom(lexiconName.substring(0,lexiconName.lastIndexOf(".")).concat(ApplicationSetup.LEXICON_INDEX_SUFFIX)); - numberOfLexiconEntries = (int) (lexiconFile.length() / (long)lexiconEntryLength); - - if (USE_HASH) { - try{ - String hashFilename = lexiconName.substring(0,lexiconName.lastIndexOf(".")).concat(ApplicationSetup.LEXICON_HASH_SUFFIX); - ObjectInputStream ois = new ObjectInputStream(Files.openFileStream(hashFilename)); - map = (TIntObjectHashMap)ois.readObject(); - ois.close(); - } - catch (IOException ioe) { - logger.warn("Input/output exception while reading the hashmap used for the lexicon. Hash will not be used." + ioe); - USE_HASH = false; - } catch (ClassNotFoundException cnfe) { - logger.warn("ClassNotFoundException while reading the hashmap used for the lexicon. Hash will not be used." + cnfe); - USE_HASH = false; - } - }//USE_HASH - } catch (IOException ioe) { - logger.error("Input/output exception while opening for reading the lexicon file: " + ioe); - } - - } - - /** - * Closes the lexicon and lexicon index files. - */ - public void close() { - try { - idToOffsetFile.close(); - lexiconFile.close(); - } catch (IOException ioe) { - logger.error("Input/output exception while closing the lexicon file: " + ioe); - } - } - - /** - * Prints out the contents of the lexicon file. - * Streams are used to read the lexicon file. - */ - public void print() { - LexiconInputStream tmp=null; - try{ - tmp = (LexiconInputStream)inputStreamClass.getConstructor(String.class).newInstance(this.lexiconFileName); - } catch (Exception e) {logger.error(e); return;} - final LexiconInputStream _lis=tmp; - _lis.print(); - } - - /** - * Finds the term given its term code. - * - * @return true if the term is found, else return false - * @param _termId the term's identifier - */ - public boolean findTerm(int _termId) { - try { - idToOffsetFile.seek((long)_termId * 8L); - return seekEntry((int) (idToOffsetFile.readLong()/(long)lexiconEntryLength)); - } catch(IOException ioe) { - logger.error("Input/Output exception while reading the lexicon index file for termid "+_termId+": ", ioe); - } - return false; - - } - /** - * Performs a binary search in the lexicon - * in order to locate the given term. - * If the term is located, the properties - * termCharacters, documentFrequency, - * termFrequency, startOffset, startBitOffset, - * endOffset and endBitOffset contain the - * values related to the term. - * @param _term The term to search for. - * @return true if the term is found, and false otherwise. 
- */ - public boolean findTerm(String _term) { - int low = -1; - int high = numberOfLexiconEntries; - int i; - int compareStrings; - - if (USE_HASH) { - int firstChar = _term.charAt(0); - int[] boundaries = (int[])map.get(firstChar); - low = boundaries[0]; - high = boundaries[1]; - - } - - //if (logger.isDebugEnabled()) - // logger.debug("lexicon hash low high for term " + _term + " are: " + low + " " + high); - - try { - while (high-low>1) { - - i = (high + low)/2; - - lexiconFile.seek((long)i * (long)lexiconEntryLength); - lexiconFile.readFully(buffer, 0, lexiconEntryLength); - term = new String(buffer,0,ApplicationSetup.STRING_BYTE_LENGTH).trim(); - - if ((compareStrings = _term.compareTo(term))< 0) - high = i; - else if (compareStrings > 0) - low = i; - else { - seekEntry(i); - return true; - } - - - } - } catch(IOException ioe) { - logger.fatal("IOException while binary searching the lexicon: " + ioe); - } - - if (high == numberOfLexiconEntries) - return false; - - seekEntry(high); - if (_term.compareTo(term) == 0) - return true; - return false; - } - - /** - * Returns the bit offset in the last byte of - * the term's entry in the inverted file. - * @deprecated - * @return byte the bit offset in the last byte of - * the term's entry in the inverted file - */ - public byte getEndBitOffset() { - return endBitOffset; - } - /** - * Returns the ending offset of the term's entry in the inverted file. - * @deprecated - * @return long The ending byte of the term's entry in the inverted file. - */ - public long getEndOffset() { - return endOffset; - } - /** - * Return the document frequency for the given term. - * @deprecated - * @return int The document frequency for the given term - */ - public int getNt() { - return documentFrequency; - } - /** - * Returns the number of entries in the lexicon. - * @return the number of entries in the lexicon. - * @deprecated - */ - public long getNumberOfLexiconEntries() { - return numberOfLexiconEntries; - } - /** - * The bit offset in the starting byte of - * the entry in the inverted file. - * @deprecated - * @return byte The number of bits in the first - * byte of the entry in the inverted file - */ - public byte getStartBitOffset() { - return startBitOffset; - } - /** - * Returns the beginning of the term's entry in the inverted file. - * @deprecated - * @return long the start offset (in bytes) in the inverted file - */ - public long getStartOffset() { - return startOffset; - } - /** - * Insert the method's description here. - * @deprecated - * @return java.lang.String The string representation of the seeked term. - */ - public String getTerm() { - return this.term.trim(); - } - /** - * Returns the term's id. - * @deprecated - * @return int the term's id. - */ - public int getTermId() { - return termId; - } - /** - * Returns the term frequency for the already seeked term. - * - * @return int The term frequency in the collection. - * @deprecated - */ - public int getTF() { - return termFrequency; - } - /** - * Seeks the i-th entry of the lexicon. - * TODO read a byte array from the file and decode it, - * instead of reading the different pieces of - * information separately. - * @param i The index of the entry we are looking for. - * @return true if the entry was found, false otherwise. 
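(Both string lookups in this deleted class, findTerm(String) above and getLexiconEntry(String) below, share one strategy: an optional per-first-character [low,high] bound taken from the serialised lexicon hash, followed by a binary search over the fixed-length on-disk entries. A condensed sketch of that strategy follows; readEntryTerm(i) is hypothetical shorthand for the seek/readFully/trim sequence in the real methods.)

    // Sketch only: `map` is the TIntObjectHashMap of int[] bounds loaded in
    // the constructor when lexicon.use.hash is enabled.
    int findEntryIndex(String _term) throws IOException {
        int low = -1, high = numberOfLexiconEntries;
        final int[] bounds = map == null ? null : (int[]) map.get(_term.charAt(0));
        if (bounds != null) { low = bounds[0]; high = bounds[1]; }
        while (high - low > 1) {
            final int i = (high + low) / 2;
            final int cmp = _term.compareTo(readEntryTerm(i));
            if (cmp < 0) high = i;
            else if (cmp > 0) low = i;
            else return i;                 // exact match found mid-search
        }
        // one candidate remains; verify that it really matches
        return (high < numberOfLexiconEntries && _term.equals(readEntryTerm(high))) ? high : -1;
    }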
- */ - public boolean seekEntry(int i) { - try { - if (i >= numberOfLexiconEntries || i < 0) - return false; - else { - if (i == 0) { - lexiconFile.seek(0); - startOffset = 0; - startBitOffset = 0; - lexiconFile.readFully(buffer, 0, lexiconEntryLength); - dataInput.reset(); - term = new String(buffer,0,ApplicationSetup.STRING_BYTE_LENGTH).trim(); - } else { - lexiconFile.seek((i-1) * (long)lexiconEntryLength + (long)(ApplicationSetup.STRING_BYTE_LENGTH + 12)); - lexiconFile.readFully(buffer, 0, lexiconEntryLength + 9); - dataInput.reset(); - startOffset = dataInput.readLong(); - startBitOffset = dataInput.readByte(); - if (++startBitOffset == 8) { - startBitOffset = 0; - startOffset++; - } - term = new String(buffer, 9, ApplicationSetup.STRING_BYTE_LENGTH).trim(); - } - dataInput.skipBytes(ApplicationSetup.STRING_BYTE_LENGTH); - termId = dataInput.readInt(); - documentFrequency = dataInput.readInt(); - termFrequency = dataInput.readInt(); - endOffset = dataInput.readLong(); - endBitOffset = dataInput.readByte(); - return true; - } - } catch (IOException ioe) { - logger.error("Input/Output exception while reading the idToOffset file. ", ioe); - } - return false; - } - - - /** - * In an already stored entry in the lexicon - * file, the information about the term frequency, - * the endOffset in bytes, and the endBitOffset in the last - * byte, is updated. The term is specified by the index of the entry. - * - * @return true if the information is updated properly, - * otherwise return false - * @param i the i-th entry - * @param frequency the term's Frequency - * @param endOffset the offset of the ending byte in the inverted file - * @param endBitOffset the offset in bits in the ending byte - * in the term's entry in inverted file - * @deprecated The Lexicon class is only used for reading the - * lexicon file, and not for writing any information. - */ - public boolean updateEntry( - int i, - int frequency, - long endOffset, - byte endBitOffset) { - - if (! (lexiconFile instanceof RandomDataOutput)) - return false; - RandomDataOutput _lexiconFile = (RandomDataOutput)lexiconFile; - try { - long lexiconOffset = (long)i * (long)lexiconEntryLength; - //we seek the offset where the frequency should be writen - _lexiconFile.seek( - lexiconOffset + ApplicationSetup.STRING_BYTE_LENGTH + 8); - _lexiconFile.writeInt(frequency); - _lexiconFile.writeLong(endOffset); - _lexiconFile.writeByte(endBitOffset); - } catch (IOException ioe) { - logger.error("Input/Output exception while writing to the lexicon file. ", ioe); - } - return false; - } - - - /** Returns the number of entries in the lexicon file specified by f. - * @param f The file to find the number of entries in - */ - public static int numberOfEntries(File f) { - return (int) ( f.length()/(long)lexiconEntryLength ); - } - - /** Returns the number of entries in the lexicon file specified by filename. - * @param filename - */ - public static int numberOfEntries(String filename) { - return numberOfEntries(new File(filename)); - } - - - /** Returns a LexiconEntry describing all the information in the lexicon about the ith term - * in the lexicon. - * @param termNumber The ith term in the lexicon. i is 0-based, and runs to getNumberOfLexiconEntries()-1 - * @return LexiconEntry all information about the term's entry in the lexicon. null if termid not found - */ - public LexiconEntry getIthLexiconEntry(int termNumber) { - if (! 
seekEntry(termNumber)) - return null; - LexiconEntry le = new LexiconEntry(); - le.termId = this.termId; - le.term = this.term.trim(); - le.TF = this.termFrequency; - le.n_t = this.documentFrequency; - le.startOffset = this.startOffset; - le.startBitOffset = this.startBitOffset; - le.endOffset = this.endOffset; - le.endBitOffset = this.endBitOffset; - return le; - } - - /** Returns a LexiconEntry describing all the information in the lexicon about the term - * denoted by termid - * @param termid the termid of the term of interest - * @return LexiconEntry all information about the term's entry in the lexicon. null if termid not found */ - public LexiconEntry getLexiconEntry(int termid) { - /* TODO: improve this to the effectiveness level of getLexiconEntry() */ - if (! findTerm(termid)) - return null; - LexiconEntry le = new LexiconEntry(); - le.termId = this.termId; - le.term = this.term.trim(); - le.TF = this.termFrequency; - le.n_t = this.documentFrequency; - le.startOffset = this.startOffset; - le.startBitOffset = this.startBitOffset; - le.endOffset = this.endOffset; - le.endBitOffset = this.endBitOffset; - return le; - } - - /** Returns a LexiconEntry describing all the information in the lexicon about the term - * denoted by _term - * @param _term the String term that is of interest - * @return LexiconEntry all information about the term's entry in the lexicon. null if termid not found */ - public LexiconEntry getLexiconEntry(String _term) { - int low = -1; - int high = numberOfLexiconEntries; - int i; - int compareStrings; - String term; - byte[] buffer = new byte[lexiconEntryLength+9]; //to get the start offsets as well - - if (USE_HASH) { - int firstChar = _term.charAt(0); - int[] boundaries = (int[])map.get(firstChar); - if (boundaries != null) - { - low = boundaries[0]; - high = boundaries[1]; - } - //System.out.println("lexicon use hash: " + low + " " + high); - } - - try { - while (high-low>1) { - - i = (high + low)/2; - if (i==0) { - lexiconFile.seek(0); - lexiconFile.readFully(buffer, 0, lexiconEntryLength); - term = new String(buffer,0,ApplicationSetup.STRING_BYTE_LENGTH).trim(); - } else { - lexiconFile.seek((long)i * (long)(lexiconEntryLength)-9L); - lexiconFile.readFully(buffer, 0, lexiconEntryLength+9); - term = new String(buffer,9,ApplicationSetup.STRING_BYTE_LENGTH).trim(); - } - - if ((compareStrings = _term.compareTo(term))< 0) - high = i; - else if (compareStrings > 0) - low = i; - else { //read the rest and return the data - return getLexiconEntryFromBuffer(buffer, term, i); - } - } - - if (high == numberOfLexiconEntries) - return null; - - if (high == 0) { - lexiconFile.seek(0); - lexiconFile.readFully(buffer, 0, lexiconEntryLength); - term = new String(buffer,0,ApplicationSetup.STRING_BYTE_LENGTH).trim(); - } else { - lexiconFile.seek((long)high * (long)(lexiconEntryLength)-9L); - lexiconFile.readFully(buffer, 0, lexiconEntryLength+9); - term = new String(buffer,9,ApplicationSetup.STRING_BYTE_LENGTH).trim(); - } - - if (_term.compareTo(term) == 0) { - return getLexiconEntryFromBuffer(buffer, term, high); - } - } catch(IOException ioe) { - logger.fatal("IOException while binary searching the lexicon: " + ioe); - } - return null; - } - - protected LexiconEntry getLexiconEntryFromBuffer(byte[] buffer, String term, int index) { - int offset; - LexiconEntry lEntry = new LexiconEntry(); - lEntry.term = term; - if (index==0) { - lEntry.startOffset = 0; - lEntry.startBitOffset = 0; - offset = ApplicationSetup.STRING_BYTE_LENGTH; - } else { - offset = 0; -// 
lEntry.startOffset = -// (((((((buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | -// buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff; - - long startOffset = (buffer[offset++] & 0xff); - for (int j=0; j<7; j++) - startOffset = startOffset<<8 | (buffer[offset++] & 0xff); - lEntry.startOffset = startOffset; - - - lEntry.startBitOffset = (byte)(buffer[offset++]&0xff); - if (++lEntry.startBitOffset == 8) { - lEntry.startBitOffset = 0; - lEntry.startOffset++; - } - - offset += ApplicationSetup.STRING_BYTE_LENGTH; - } - lEntry.termId = - (((buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff; - lEntry.n_t = - (((buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff; - lEntry.TF = - (((buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff; - -// lEntry.endOffset = -// (((((((buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | -// buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff; - - long endOffset = (int)(buffer[offset++] & 0xff); - for (int j=0; j<7; j++) - endOffset = endOffset<<8 | (buffer[offset++] & 0xff); - lEntry.endOffset = endOffset; - - lEntry.endBitOffset = (byte)(buffer[offset]&0xff); - return lEntry; - } - - /** Returns an interator that gives every item in the lexicon, in lexical order. Underlying implementation is - * using a lexicon input stream */ - public Iterator<String> iterator() - { - LexiconInputStream tmp=null; - try{ - tmp = (LexiconInputStream)inputStreamClass.getConstructor(String.class).newInstance(this.lexiconFileName); - } catch (Exception e) {logger.error(e);} - final LexiconInputStream _lis=tmp; - return new Iterator<String>(){ - LexiconInputStream lis = _lis; - public boolean hasNext(){ - try{ - return lis.readNextEntry() != -1; - } catch (IOException ioe) { - logger.error(ioe); - return false; - } - } - public String next() - { - return lis.getTerm(); - } - public void remove() { throw new UnsupportedOperationException();} - }; - } -} +import java.util.Map; +public abstract class Lexicon<KEY> implements Closeable, Iterable<Map.Entry<KEY,LexiconEntry>> +{ + static class LexiconFileEntry<KEY2> implements Map.Entry<KEY2,LexiconEntry> + { + KEY2 key; + LexiconEntry value; + + public LexiconFileEntry(KEY2 k, LexiconEntry v) + { + this.key = k; + this.value = v; + } + + public int hashCode() + { + LexiconFileEntry<KEY2> e = this; + return (e.getKey()==null ? 0 : e.getKey().hashCode()) ^ + (e.getValue()==null ? 0 : e.getValue().hashCode()); + } + + public LexiconEntry setValue(LexiconEntry v) + { + LexiconEntry old = value; + value = v; + return old; + } + + public KEY2 getKey() + { + return key; + } + + public LexiconEntry getValue() + { + return value; + } + + @SuppressWarnings("unchecked") + public boolean equals(Object o) + { + if (! (o instanceof Map.Entry)) + return false; + LexiconFileEntry<KEY2> e1 = this; + Map.Entry e2 = (Map.Entry)o; + return (e1.getKey()==null ? + e2.getKey()==null : e1.getKey().equals(e2.getKey())) && + (e1.getValue()==null ? 
+ e2.getValue()==null : e1.getValue().equals(e2.getValue())); + } + } + + public abstract int numberOfEntries(); + public abstract LexiconEntry getLexiconEntry(KEY term); + public abstract Map.Entry<KEY,LexiconEntry> getLexiconEntry(int termid); + public abstract Map.Entry<KEY,LexiconEntry> getIthLexiconEntry(int index); + public abstract void close(); +} \ No newline at end of file diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/LexiconEntry.java src/uk/ac/gla/terrier/structures/LexiconEntry.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/LexiconEntry.java 2009-01-28 20:16:55.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/LexiconEntry.java 2009-03-03 14:34:49.000000000 +0000 @@ -1,86 +1,14 @@ - -/* - * Terrier - Terabyte Retriever - * Webpage: http://ir.dcs.gla.ac.uk/terrier - * Contact: terrier{a.}dcs.gla.ac.uk - * University of Glasgow - Department of Computing Science - * http://www.gla.ac.uk/ - * - * The contents of this file are subject to the Mozilla Public License - * Version 1.1 (the "License"); you may not use this file except in - * compliance with the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is BlockDirectIndex.java. - * - * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. - * All Rights Reserved. - * - * Contributor(s): - * Vassilis Plachouras (original author) - * Craig Macdonald - */ package uk.ac.gla.terrier.structures; +import org.apache.hadoop.io.Writable; -/** Contains all the information about one entry in the Lexicon. - * Created to make thread-safe lookups in the Lexicon easier. */ -public class LexiconEntry { - - /** Create an empty LexiconEntry */ - public LexiconEntry(){} - - /** Create a lexicon entry with the following information. 
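(The abstract, generic Lexicon above replaces the file-bound class deleted earlier in this hunk: lookups now return Map.Entry pairs of key and LexiconEntry, and iteration over the whole structure replaces the old readNextEntry() protocol. A hedged sketch of how a caller might consume the new API, assuming a String-keyed lexicon registered under the structure name "lexicon":)

    import java.util.Map;
    import uk.ac.gla.terrier.structures.Index;
    import uk.ac.gla.terrier.structures.Lexicon;
    import uk.ac.gla.terrier.structures.LexiconEntry;

    public class LexiconLookupExample {
        @SuppressWarnings("unchecked")
        public static void main(String[] args) {
            Index index = Index.createIndex();
            Lexicon<String> lex = (Lexicon<String>) index.getIndexStructure("lexicon");
            LexiconEntry le = lex.getLexiconEntry("retrieval");          // lookup by term
            if (le != null)
                System.out.println(le);                                  // "(Nt,TF)@bytes,bits"
            Map.Entry<String, LexiconEntry> e = lex.getLexiconEntry(10); // lookup by termid
            if (e != null)
                System.out.println(e.getKey() + " -> " + e.getValue());
            for (Map.Entry<String, LexiconEntry> entry : lex)            // full scan
                System.out.println(entry.getKey());
            lex.close();
        }
    }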
- * @param t the term - * @param tid the term id - * @param n_t the number of documents the term occurs in (document frequency) - * @param TF the total count of therm t in the collection - */ - public LexiconEntry(String t, int tid, int n_t, int TF) - { - this.term =t; - this.termId = tid; - this.n_t = n_t; - this.TF = TF; - } - - /** increment this lexicon entry by another */ - public void add(LexiconEntry le) - { - this.n_t += le.n_t; - this.TF += le.TF; - } - - /** alter this lexicon entry to subtract another lexicon entry */ - public void subtract(LexiconEntry le) - { - this.n_t -= le.n_t; - this.TF -= le.TF; - } - - /** the term of this entry */ - public String term; - /** the termid of this entry */ - public int termId; - /** the number of document that this entry occurs in */ - public int n_t; - /** the total number of occurrences of the term in the index */ - public int TF; - /** the start offset of the entry in the inverted index */ - public long startOffset; - /** the start bit offset of the entry in the inverted index */ - public byte startBitOffset; - /** the end offset of the entry in the inverted index */ - public long endOffset; - /** the end bit offset of the entry in the inverted index */ - public byte endBitOffset; +public abstract class LexiconEntry implements TermStatistics, BitIndexPointer, Writable +{ + + public String toString() + { + return "("+getDocumentFrequency()+","+getFrequency()+')' +'@'+getBytes() + ',' + getBits(); + } - /** returns a string representation of this lexicon entry */ - public String toString() { - return term + " " + termId + " " + n_t + " " + TF + " " + startOffset + " " + startBitOffset + " " + endOffset + " " + endBitOffset; - } -} + public abstract void setTermId(int newTermId); +} \ No newline at end of file diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/LexiconInputStream.java src/uk/ac/gla/terrier/structures/LexiconInputStream.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/LexiconInputStream.java 2009-01-28 20:16:55.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/LexiconInputStream.java 1970-01-01 01:00:00.000000000 +0100 @@ -1,340 +0,0 @@ -/* - * Terrier - Terabyte Retriever - * Webpage: http://ir.dcs.gla.ac.uk/terrier - * Contact: terrier{a.}dcs.gla.ac.uk - * University of Glasgow - Department of Computing Science - * http://www.gla.ac.uk/ - * - * The contents of this file are subject to the Mozilla Public License - * Version 1.1 (the "License"); you may not use this file except in - * compliance with the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is LexiconInputStream.java. - * - * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. - * All Rights Reserved. 
- * - * Contributor(s): - * Vassilis Plachouras (original author) - */ -package uk.ac.gla.terrier.structures; -import java.io.DataInput; -import java.io.DataInputStream; -import java.io.EOFException; -import java.io.File; -import java.io.IOException; -import java.util.Iterator; - -import org.apache.log4j.Logger; - -import uk.ac.gla.terrier.utility.ApplicationSetup; -import uk.ac.gla.terrier.utility.Files; -/** - * This class implements an input stream for the lexicon structure. - * @author Vassilis Plachouras - * @version $Revision: 1.36 $ - */ -public class LexiconInputStream implements Iterable, Closeable{ - /** The logger used for the Lexicon */ - protected Logger logger = Logger.getRootLogger(); - - /** The term represented as an array of bytes.*/ - protected byte[] termCharacters = - new byte[ApplicationSetup.STRING_BYTE_LENGTH]; - /** The term represented as a string.*/ - protected String term; - /** An integer representing the id of the term.*/ - protected int termId; - /** The document frequency of the term.*/ - protected int documentFrequency; - /** The term frequency of the term.*/ - protected int termFrequency; - /** The offset in bytes in the inverted file of the term.*/ - protected long endOffset; - /** The starting offset in bytes in the inverted file of the term.*/ - protected long startOffset; - /** The starting bit offset in the inverted file of the term.*/ - protected byte startBitOffset; - /** - * The offset in bits in the starting byte in the inverted file. - * Its initial value is -1 so that when we do startBitOffset = - * endBitOffset +1, the first startBitOffset is 0 - */ - protected byte endBitOffset = -1; - /** A data input stream to read from the bufferInput.*/ - protected DataInput lexiconStream = null; - /** The length of the lexicon file. */ - protected long lexiconFilelength; - /** size of one entry of the lexicon */ - protected int entrySize = 0; - - /** number of pointers read so far */ - protected long numPointersRead = 0; - /** number of tokens read so far */ - protected long numTokensRead = 0; - /** number of terms read so far */ - protected int numTermsRead = 0; - - /** A constructor for child classes that doesnt open the file */ - protected LexiconInputStream(long a, long b, long c) { } - - /** - * A default constructor. Opens the default lexicon. - */ - public LexiconInputStream() { - this(ApplicationSetup.LEXICON_FILENAME); - } - /** - * A constructor given the filename. - * @param filename java.lang.String the name of the lexicon file. - */ - public LexiconInputStream(String filename) { - try { - lexiconStream = new DataInputStream(Files.openFileStream(filename)); - this.lexiconFilelength = Files.length(filename); - } catch (IOException ioe) { - logger.fatal( - "I/O Exception occured while opening the lexicon file. Stack trace follows.",ioe); - } - entrySize = Lexicon.lexiconEntryLength; - } - - public LexiconInputStream(String path, String prefix) { - this(path + ApplicationSetup.FILE_SEPARATOR + prefix + ApplicationSetup.LEXICONSUFFIX); - } - /** - * A constructor given the filename. - * @param file java.io.File the name of the lexicon file. - */ - public LexiconInputStream(File file) { - try { - lexiconStream = new DataInputStream(Files.openFileStream(file)); - this.lexiconFilelength = Files.length(file); - } catch (IOException ioe) { - logger.fatal( - "I/O Exception occured while opening the lexicon file. 
Stack trace follows.",ioe); - } - entrySize = Lexicon.lexiconEntryLength; - } - - /** Read a lexicon from the specified input stream */ - public LexiconInputStream(DataInput in) { - lexiconStream = in; - this.lexiconFilelength = 0; - entrySize = Lexicon.lexiconEntryLength; - } - - /** - * Closes the lexicon stream. - * @throws IOException if an I/O error occurs - */ - public void close() { - try{ - if (lexiconStream instanceof java.io.Closeable) - ((java.io.Closeable)lexiconStream).close(); - } catch (IOException ioe){} - } - - public int getEntrySize() - { - return entrySize; - } - - /** - * Read the next lexicon entry. - * @return the number of bytes read if there is no error, - * otherwise returns -1 in case of EOF - * @throws java.io.IOException if an I/O error occurs - */ - public int readNextEntry() throws IOException { - try { - startBitOffset = (byte) (endBitOffset + 1); - startOffset = endOffset; - if (startBitOffset == 8) { - startOffset = endOffset + 1; - startBitOffset = 0; - } - lexiconStream.readFully( - termCharacters, - 0, - ApplicationSetup.STRING_BYTE_LENGTH); - - termId = lexiconStream.readInt(); - documentFrequency = lexiconStream.readInt(); - termFrequency = lexiconStream.readInt(); - endOffset = lexiconStream.readLong(); - endBitOffset = lexiconStream.readByte(); - numPointersRead += documentFrequency; - numTokensRead += termFrequency; - numTermsRead++; - return Lexicon.lexiconEntryLength; - } catch (EOFException eofe) { - return -1; - } - } - - /** This is an alias to readNextEntry(), except for implementations that - * cannot parse the string from the byte array. */ - public int readNextEntryBytes() throws IOException { - return readNextEntry(); - } - /** - * Returns the number of entries in the lexicon file. - */ - public int numberOfEntries(){ - return (int)(lexiconFilelength / Lexicon.lexiconEntryLength); - } - - /** - * Prints out the contents of the lexicon file to check. - */ - public void print() { - int i = 0; //counter - int entryLength = getEntrySize(); - System.err.println("LexOffset, Term, Termid, DF, TF, OffsetBy, OffsetBit"); - try { - while (readNextEntry() != -1) { - System.out.println( - "" - + ((long)i * (long)entryLength) - + ", " - + getTerm() - + ", " - + termId - + ", " - + documentFrequency - + ", " - + termFrequency - + ", " - + endOffset - + ", " - + endBitOffset); - i++; - } - } catch (IOException ioe) { - logger.fatal( - "Input/Output exception while reading the document index " + - "input stream. Stack trace follows.",ioe); - } - } - - /** Returns the number of pointers there would be in an inverted index built using this lexicon (thus far). - * This is equal to the sum of the Nts written to this lexicon output stream. */ - public long getNumberOfPointersRead() - { - return numPointersRead; - } - - /** Returns the number of tokens there are in the entire collection represented by this lexicon (thus far). - * This is equal to the sum of the TFs written to this lexicon output stream. */ - public long getNumberOfTokensRead() - { - return numTokensRead; - } - - /** Returns the number of terms written so far by this LexiconInputStream */ - public int getNumberOfTermsRead() - { - return numTermsRead; - } - - - /** - * Returns the bit offset in the last byte of - * the term's entry in the inverted file. - * @return byte the bit offset in the last byte of - * the term's entry in the inverted file - */ - public byte getEndBitOffset() { - return endBitOffset; - } - /** - * Returns the ending offset of the term's - * entry in the inverted file. 
- * @return long The ending byte of the term's - * entry in the inverted file. - */ - public long getEndOffset() { - return endOffset; - } - /** - * Returns the bit offset in the first byte - * of the term's entry in the inverted file. - * @return byte the bit offset in the first byte - * of the term's entry in the inverted file - */ - public byte getStartBitOffset() { - return startBitOffset; - } - /** - * Returns the starting offset of the term's - * entry in the inverted file. - * @return long The starting byte of the term's entry - * in the inverted file. - */ - public long getStartOffset() { - return startOffset; - } - /** - * Return the document frequency for the given term. - * @return int The document frequency for the given term - */ - public int getNt() { - return documentFrequency; - } - /** - * Returns the string representation of the term. - * @return the string representation of the already found term. - */ - public String getTerm() { - return (new String(termCharacters)).trim(); - } - /** - * Returns the term's id. - * @return the term's id. - */ - public int getTermId() { - return termId; - } - /** - * Returns the term frequency for the already seeked term. - * @return the term frequency in the collection. - */ - public int getTF() { - return termFrequency; - } - /** - * Returns the bytes of the String. - * @return the byte array holding the term's byte representation - */ - public byte[] getTermCharacters() { - return termCharacters; - } - - /** Returns an Interator of Strings of each term in this lexicon */ - public Iterator iterator() - { - return new Iterator(){ - public boolean hasNext(){ - try{ - return readNextEntry() != -1; - } catch (IOException ioe) { - logger.error(ioe); - return false; - } - } - public String next() - { - return getTerm(); - } - public void remove() { throw new UnsupportedOperationException();} - - }; - } -} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/LexiconOutputStream.java src/uk/ac/gla/terrier/structures/LexiconOutputStream.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/LexiconOutputStream.java 2009-01-28 20:16:55.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/LexiconOutputStream.java 2009-03-03 14:34:49.000000000 +0000 @@ -25,40 +25,14 @@ */ package uk.ac.gla.terrier.structures; import java.io.DataOutput; -import java.io.DataOutputStream; -import java.io.File; import java.io.IOException; - -import org.apache.log4j.Logger; - -import uk.ac.gla.terrier.utility.ApplicationSetup; -import uk.ac.gla.terrier.utility.Files; /** * This class implements an output stream for the lexicon structure. 
- * @author Vassilis Plachouras + * @author Vassilis Plachouras & Craig Macdonald * @version $Revision: 1.29 $ */ -public class LexiconOutputStream implements Closeable { - /** The logger used */ - private static Logger logger = Logger.getRootLogger(); - /** A zero buffer for writing to the file.*/ - protected final byte[] zeroBuffer = - new byte[ApplicationSetup.STRING_BYTE_LENGTH]; - /** The term represented as an array of bytes.*/ - protected final byte[] termCharacters = - new byte[ApplicationSetup.STRING_BYTE_LENGTH]; - /** The term represented as a string.*/ - protected String term; - /** An integer representing the id of the term.*/ - protected int termId; - /** The document frequency of the term.*/ - protected int documentFrequency; - /** The term frequency of the term.*/ - protected int termFrequency; - /** The offset in bytes in the inverted file of the term.*/ - protected long endOffset; - /** The offset in bits in the starting byte in the inverted file.*/ - protected byte endBitOffset; +public abstract class LexiconOutputStream<KEY> implements Closeable { + /** The data output stream used for writing the lexicon.*/ protected DataOutput lexiconStream = null; /** Pointer written - the sum of the Nts */ @@ -67,57 +41,7 @@ protected long numTokensWritten = 0; protected int numTermsWritten = 0; - /** A constructor for child classes that doesnt open the file */ - protected LexiconOutputStream(long a, long b, long c) { } - - /** - * A default constructor. - */ - public LexiconOutputStream() { - try { - lexiconStream = new DataOutputStream(Files.writeFileStream(ApplicationSetup.LEXICON_FILENAME)); - } catch (IOException ioe) { - logger.fatal( - "I/O error occured while opening the lexicon file. Stack trace follows.",ioe); - } - } - /** Create a lexicon using the specified data stream */ - public LexiconOutputStream(DataOutput out){ - lexiconStream = out; - } - - /** - * A constructor given the filename. - * @param filename java.lang.String the name of the lexicon file. - */ - public LexiconOutputStream(String filename) { - try { - lexiconStream = new DataOutputStream(Files.writeFileStream(filename)); - } catch (IOException ioe) { - logger.fatal( - "I/O error occured while opening the lexicon file. Stack trace follows.",ioe); - } - } - /** - * A constructor given the filename. - * @param file java.io.File the name of the lexicon file. - */ - public LexiconOutputStream(File file) { - try { - lexiconStream = new DataOutputStream(Files.writeFileStream(file)); - } catch (IOException ioe) { - logger.fatal( - "I/O error occured while opening the lexicon file. Stack trace follows.",ioe); - } - } - - /** A constructor for a LexiconOutputStream given the index path and prefix - * @param path String the path to the index - * @param prefix String the prefix of the filenames in the index - */ - public LexiconOutputStream(String path, String prefix) { - this(path + ApplicationSetup.FILE_SEPARATOR + prefix + ApplicationSetup.LEXICONSUFFIX); - } + protected LexiconOutputStream() { } /** @@ -134,77 +58,18 @@ * Writes a lexicon entry. * @return the number of bytes written to the file. 
* @throws java.io.IOException if an I/O error occurs - * @param _term the string representation of the term - * @param _termId the terms integer identifier - * @param _documentFrequency the term's document frequency in the collection - * @param _termFrequency the term's frequency in the collection - * @param _endOffset the term's ending byte offset in the inverted file - * @param _endBitOffset the term's ending byte bit-offset in the inverted file - */ - public int writeNextEntry( - String _term, - int _termId, - int _documentFrequency, - int _termFrequency, - long _endOffset, - byte _endBitOffset) - throws IOException { - byte[] tmpBytes = _term.getBytes(); - final int length = tmpBytes.length; - numPointersWritten += _documentFrequency; - numTokensWritten += _termFrequency; - numTermsWritten++; - lexiconStream.write(tmpBytes, 0, length); - /* if an ArrayIndexOutOfBoundsException ocurrs here - * this means that the term is longer than STRING_BYTE_LENGTH */ - lexiconStream.write( - zeroBuffer, - 0, - ApplicationSetup.STRING_BYTE_LENGTH - length); - lexiconStream.writeInt(_termId); - lexiconStream.writeInt(_documentFrequency); - lexiconStream.writeInt(_termFrequency); - lexiconStream.writeLong(_endOffset); - lexiconStream.writeByte(_endBitOffset); - return Lexicon.lexiconEntryLength; - } - /** - * Writes a lexicon entry. - * @return the number of bytes written. - * @throws java.io.IOException if an I/O error occurs - * @param _term the byte[] representation of the term. Using this format means that - * the term does not have to be decoded and recoded every time. - * @param _termId the terms integer identifier - * @param _documentFrequency the term's document frequency in the collection - * @param _termFrequency the term's frequency in the collection - * @param _endOffset the term's ending byte offset in the inverted file - * @param _endBitOffset the term's ending byte bit-offset in the inverted file + * @param _key the key - usually the term + * @param _value the lexicon entry value */ - public int writeNextEntry( - byte[] _term, - int _termId, - int _documentFrequency, - int _termFrequency, - long _endOffset, - byte _endBitOffset) - throws IOException { - final int length = _term.length; - numPointersWritten += _documentFrequency; - numTokensWritten += _termFrequency; + public abstract int writeNextEntry(KEY _key, LexiconEntry _value) throws IOException; + + protected void incrementCounters(TermStatistics t) + { numTermsWritten++; - lexiconStream.write(_term, 0, _term.length); - lexiconStream.write( - zeroBuffer, - 0, - ApplicationSetup.STRING_BYTE_LENGTH - length); - lexiconStream.writeInt(_termId); - lexiconStream.writeInt(_documentFrequency); - lexiconStream.writeInt(_termFrequency); - lexiconStream.writeLong(_endOffset); - lexiconStream.writeByte(_endBitOffset); - return Lexicon.lexiconEntryLength; + numPointersWritten += t.getDocumentFrequency(); + numTokensWritten += t.getFrequency(); } - + /** Returns the number of pointers there would be in an inverted index built using this lexicon (thus far). * This is equal to the sum of the Nts written to this lexicon output stream. */ public long getNumberOfPointersWritten() @@ -224,56 +89,4 @@ { return numTermsWritten; } - - /** - * Sets the bit offset in the last byte of the term's entry in the inverted file. - * @param _endBitOffset byte the bit offset in the last byte of the - * term's entry in the inverted file. 
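(LexiconOutputStream is now abstract and generic in its key type: concrete subclasses supply writeNextEntry() and are expected to keep the Nt/TF/term counters current through incrementCounters(). A hypothetical minimal subclass follows, assumed to live in uk.ac.gla.terrier.structures so that the protected lexiconStream field is visible; keys serialise as Hadoop Text and values through LexiconEntry's Writable interface:)

    import java.io.DataOutput;
    import java.io.IOException;
    import org.apache.hadoop.io.Text;

    class TextLexiconOutputStream extends LexiconOutputStream<Text> {
        public TextLexiconOutputStream(DataOutput out) {
            lexiconStream = out;
        }
        public int writeNextEntry(Text _key, LexiconEntry _value) throws IOException {
            _key.write(lexiconStream);    // the term
            _value.write(lexiconStream);  // statistics plus bit-file pointer
            incrementCounters(_value);    // maintain the totals queried below
            return 0;                     // byte accounting omitted in this sketch
        }
        public void close() {
            try {
                if (lexiconStream instanceof java.io.Closeable)
                    ((java.io.Closeable) lexiconStream).close();
            } catch (IOException ioe) { /* ignored, as elsewhere in this patch */ }
        }
    }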
- * @deprecated - */ - public void setEndBitOffset(byte _endBitOffset) { - endBitOffset = _endBitOffset; - } - /** - * Sets the ending offset of the term's entry in the inverted file. - * @param _endOffset long The ending byte of the term's - * entry in the inverted file. - * @deprecated - */ - public void setEndOffset(long _endOffset) { - endOffset = _endOffset; - } - /** - * Sets the document frequency for the given term. - * @param _Nt int The document frequency for the given term. - * @deprecated - */ - public void setNt(int _Nt) { - documentFrequency = _Nt; - } - /** - * Sets the string representation of the term. - * @param _term java.lang.String The string representation of - * the seeked term. - * @deprecated - */ - public void setTerm(String _term) { - term = _term; - } - /** - * Sets the term's id. - * @param _termId int the term's identifier. - * @deprecated - */ - public void setTermId(int _termId) { - termId = _termId; - } - /** - * Sets the term frequency for the already found term. - * @param _termFrequency int The term frequency in the collection. - * @deprecated - */ - public void setTF(int _termFrequency) { - termFrequency = _termFrequency; - } } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/LexiconUtil.java src/uk/ac/gla/terrier/structures/LexiconUtil.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/LexiconUtil.java 1970-01-01 01:00:00.000000000 +0100 +++ src/uk/ac/gla/terrier/structures/LexiconUtil.java 2009-03-03 14:34:49.000000000 +0000 @@ -0,0 +1,22 @@ +package uk.ac.gla.terrier.structures; + +import java.util.Iterator; +import java.util.Map; + +public class LexiconUtil { + + @SuppressWarnings("unchecked") + public static void printLexicon(Index index, String structureName) + { + Iterator<Map.Entry<String,LexiconEntry>> lexiconStream = + (Iterator<Map.Entry<String,LexiconEntry>>)index.getIndexStructureInputStream(structureName); + while (lexiconStream.hasNext()) + { + Map.Entry<String,LexiconEntry> lee = lexiconStream.next(); + System.out.println(lee.getKey().toString()+","+lee.getValue().toString()); + } + if (lexiconStream instanceof Closeable) { + ((Closeable)lexiconStream).close(); + } + } +} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/MapFileLexicon.java src/uk/ac/gla/terrier/structures/MapFileLexicon.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/MapFileLexicon.java 1970-01-01 01:00:00.000000000 +0100 +++ src/uk/ac/gla/terrier/structures/MapFileLexicon.java 2009-03-03 14:34:49.000000000 +0000 @@ -0,0 +1,343 @@ +package uk.ac.gla.terrier.structures; +import gnu.trove.TIntObjectHashMap; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.util.Arrays; +import java.util.Iterator; +import java.util.Map; +import java.util.Map.Entry; + +import org.apache.hadoop.io.Text; + +import uk.ac.gla.terrier.structures.indexing.LexiconBuilder; +import uk.ac.gla.terrier.structures.maps.MapFile; +import uk.ac.gla.terrier.structures.seralization.FixedSizeWriteableFactory; +import uk.ac.gla.terrier.utility.Files; +import uk.ac.gla.terrier.utility.io.RandomDataInput; +public class MapFileLexicon extends MapLexicon +{ + protected static final String MAPFILE_EXT = ".mapfile"; + protected static final String ID_EXT = ".mapid"; + protected static final String HASH_EXT = ".maphash"; + + static 
diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/LexiconUtil.java src/uk/ac/gla/terrier/structures/LexiconUtil.java
--- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/LexiconUtil.java 1970-01-01 01:00:00.000000000 +0100
+++ src/uk/ac/gla/terrier/structures/LexiconUtil.java 2009-03-03 14:34:49.000000000 +0000
@@ -0,0 +1,22 @@
+package uk.ac.gla.terrier.structures;
+
+import java.util.Iterator;
+import java.util.Map;
+
+public class LexiconUtil {
+
+ @SuppressWarnings("unchecked")
+ public static void printLexicon(Index index, String structureName)
+ {
+ Iterator<Map.Entry<String,LexiconEntry>> lexiconStream =
+ (Iterator<Map.Entry<String,LexiconEntry>>)index.getIndexStructureInputStream(structureName);
+ while (lexiconStream.hasNext())
+ {
+ Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
+ System.out.println(lee.getKey().toString()+","+lee.getValue().toString());
+ }
+ if (lexiconStream instanceof Closeable) {
+ ((Closeable)lexiconStream).close();
+ }
+ }
+}
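For reference, a quick usage sketch of the new utility, mirroring how a print-lexicon tool would drive it (assuming a default index already exists on disk):

    // Dump a lexicon structure to stdout, one "term,entry" line per term.
    Index index = Index.createIndex();
    if (index.hasIndexStructureInputStream("lexicon"))
        LexiconUtil.printLexicon(index, "lexicon");
    index.close();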
diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/MapFileLexicon.java src/uk/ac/gla/terrier/structures/MapFileLexicon.java
--- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/MapFileLexicon.java 1970-01-01 01:00:00.000000000 +0100
+++ src/uk/ac/gla/terrier/structures/MapFileLexicon.java 2009-03-03 14:34:49.000000000 +0000
@@ -0,0 +1,343 @@
+package uk.ac.gla.terrier.structures;
+import gnu.trove.TIntObjectHashMap;
+
+import java.io.DataInputStream;
+import java.io.DataOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.io.Text;
+
+import uk.ac.gla.terrier.structures.indexing.LexiconBuilder;
+import uk.ac.gla.terrier.structures.maps.MapFile;
+import uk.ac.gla.terrier.structures.seralization.FixedSizeWriteableFactory;
+import uk.ac.gla.terrier.utility.Files;
+import uk.ac.gla.terrier.utility.io.RandomDataInput;
+public class MapFileLexicon extends MapLexicon
+{
+ protected static final String MAPFILE_EXT = ".mapfile";
+ protected static final String ID_EXT = ".mapid";
+ protected static final String HASH_EXT = ".maphash";
+
+ static class CharMapBSearchShortcut implements MapFile.MapFileBSearchShortcut<Text>
+ {
+ final TIntObjectHashMap<int[]> map;
+ final int[] defaultReturn;
+ @SuppressWarnings("unchecked")
+ public CharMapBSearchShortcut(String path, String prefix, String structureName, int size) throws Exception
+ {
+ ObjectInputStream ois = new ObjectInputStream(Files.openFileStream(constructFilename(structureName, path, prefix, HASH_EXT)));
+ map = (TIntObjectHashMap<int[]>)ois.readObject();
+ ois.close();
+ defaultReturn = new int[]{0,size};
+ }
+
+ public int[] searchBounds(Text key) throws IOException {
+ int[] boundaries = map.get(key.charAt(0));
+ if (boundaries == null)
+ return defaultReturn;
+ return boundaries;
+ }
+ }
+
+ static class OnDiskLookup implements Id2EntryIndexLookup, java.io.Closeable
+ {
+ final RandomDataInput lexIdFile;
+ protected static final long SIZE_OF_INT = 4;
+ public OnDiskLookup(String path, String prefix, String structureName) throws IOException
+ {
+ lexIdFile = Files.openFileRandom(
+ constructFilename(structureName, path, prefix, ID_EXT));
+ }
+
+ public int getIndex(int termid) throws IOException
+ {
+ lexIdFile.seek(SIZE_OF_INT * (long)termid);
+ return lexIdFile.readInt();
+ }
+
+ public void close() throws IOException
+ {
+ lexIdFile.close();
+ }
+ }
+
+ static class InMemoryLookup implements Id2EntryIndexLookup
+ {
+ protected final int[] id2index;
+ public InMemoryLookup(String path, String prefix, String structureName, int size)
+ throws IOException
+ {
+ DataInputStream lexIdFile = new DataInputStream(Files.openFileStream(
+ constructFilename(structureName, path, prefix, ID_EXT)));
+ id2index = new int[size];
+ for(int i=0;i<size;i++)
+ id2index[i] = lexIdFile.readInt();
+ lexIdFile.close();
+ }
+
+ public int getIndex(int termid)
+ {
+ return id2index[termid];
+ }
+ }
+
+ @SuppressWarnings("unchecked")
+ public MapFileLexicon(String structureName, Index index) throws IOException
+ {
+ this(structureName, index.getPath(), index.getPrefix(),
+ (FixedSizeWriteableFactory<Text>)index.getIndexStructure(structureName+"-keyfactory"),
+ (FixedSizeWriteableFactory<LexiconEntry>)index.getIndexStructure(structureName+"-valuefactory"),
+ index.getIndexProperty("index."+structureName+".termids", "aligned"),
+ index.getIndexProperty("index."+structureName+".bsearchshortcut", "default")
+ );
+ }
+
+ public MapFileLexicon(String structureName, String path, String prefix,
+ FixedSizeWriteableFactory<Text> keyFactory,
+ FixedSizeWriteableFactory<LexiconEntry> valueFactory,
+ String termIdLookup, String termLookup) throws IOException
+ {
+ super(
+ new MapFile<Text,LexiconEntry>(
+ constructFilename(structureName, path, prefix, MAPFILE_EXT),
+ false,
+ keyFactory,
+ valueFactory)
+ );
+ this.keyFactory = keyFactory;
+ if (termIdLookup.equals("aligned"))
+ {
+ setTermIdLookup(new IdIsIndex());
+ }
+ else if (termIdLookup.equals("file"))
+ {
+ setTermIdLookup(new OnDiskLookup(path, prefix, structureName));
+ }
+ else if (termIdLookup.equals("fileinmem"))
+ {
+ setTermIdLookup(new InMemoryLookup(path, prefix, structureName, this.map.size()));
+ }
+ else if (termIdLookup.equals("disabled"))
+ {
+ setTermIdLookup(null);
+ }
+ else
+ {
+ throw new IOException("Unrecognised value ("+termIdLookup+") for termIdlookup for structure "+structureName);
+ }
+
+ if (termLookup.equals("charmap"))
+ {
+ try{
+ ((MapFile<Text,LexiconEntry>)this.map).setBSearchShortcut(
+ new CharMapBSearchShortcut(path, prefix, structureName, this.map.size()));
+ } catch (Exception e) {
+ throw new IOException("Problem loading MapFileBSearchShortcut for "+structureName+": "+ e.getMessage());
+ }
+ }
+ else if (termLookup.equals("default"))
+ {
+ //do nothing
+ }
+ else
+ {
+ throw new IOException("Unrecognised value ("+termLookup+") for termLookup for structure "+structureName);
+ }
+ }
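The constructor above selects the termid-to-entry resolution strategy from two index properties; at lookup time the difference is only where the termid-to-position mapping lives. A hedged reader-side sketch (termid 42 is arbitrary; the property values are the ones dispatched on above):

    // Resolve a termid to its (term, entry) pair. Which lookup runs depends
    // on the index.lexicon.termids property recorded at optimisation time:
    //   "aligned"   - termid equals the entry's position: no extra I/O (IdIsIndex)
    //   "file"      - seek 4*termid bytes into the .mapid file (OnDiskLookup)
    //   "fileinmem" - the whole .mapid array is preloaded (InMemoryLookup)
    Index index = Index.createIndex();
    MapFileLexicon lex = new MapFileLexicon("lexicon", index);
    Map.Entry<String,LexiconEntry> e = lex.getLexiconEntry(42);
    if (e != null)
        System.out.println(e.getKey() + " -> " + e.getValue());
    lex.close();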
+
+ @Override
+ public void close() {
+ super.close();
+ }
+
+ public static class MapFileLexiconIterator
+ implements Iterator<Map.Entry<String,LexiconEntry>>, Closeable
+ {
+ protected Iterator<Map.Entry<Text,LexiconEntry>> parent;
+
+ @SuppressWarnings("unchecked")
+ public MapFileLexiconIterator(String structureName, Index index) throws IOException
+ {
+ this(
+ structureName,
+ index.getPath(),
+ index.getPrefix(),
+ (FixedSizeWriteableFactory<Text>)index.getIndexStructure(structureName+"-keyfactory"),
+ (FixedSizeWriteableFactory<LexiconEntry>)index.getIndexStructure(structureName+"-valuefactory"));
+ }
+
+ public MapFileLexiconIterator(String structureName, String path, String prefix,
+ FixedSizeWriteableFactory<Text> keyFactory,
+ FixedSizeWriteableFactory<LexiconEntry> valueFactory) throws IOException
+ {
+ this(constructFilename(structureName, path, prefix, MAPFILE_EXT), keyFactory, valueFactory);
+ }
+
+ public MapFileLexiconIterator(String filename, FixedSizeWriteableFactory<Text> keyFactory,
+ FixedSizeWriteableFactory<LexiconEntry> valueFactory) throws IOException
+ {
+ this(new MapFile.EntryIterator<Text,LexiconEntry>(filename, keyFactory, valueFactory));
+ }
+
+ public MapFileLexiconIterator(Iterator<Map.Entry<Text,LexiconEntry>> _parent)
+ {
+ parent = _parent;
+ }
+ public boolean hasNext() {
+ return parent.hasNext();
+ }
+ public Entry<String,LexiconEntry> next() {
+ return MapLexicon.toStringEntry(parent.next());
+ }
+ public void remove() {
+ parent.remove();
+ }
+
+ public void close() {
+ if (parent instanceof Closeable)
+ ((Closeable)parent).close();
+ }
+ }
+
+ public Iterator<Map.Entry<String,LexiconEntry>> iterator() {
+ return new MapFileLexiconIterator(this.map.entrySet().iterator());
+ }
+
+ /** Does two things to a MapFileLexicon: adds the termid lookup file (if required),
+ * and also creates the lexicon hash file.
+ * @param structureName - name of the index structure that this MapFileLexicon represents
+ * @param index - the index that this structure belongs to
+ * @param statsCounter - a CollectionStaticticsCounter updated with each entry's statistics
+ * @throws IOException if an IO problem occurs
+ */
+ @SuppressWarnings("unchecked")
+ public static void optimise(String structureName, Index index, LexiconBuilder.CollectionStaticticsCounter statsCounter) throws IOException
+ {
+ final String mapFileFilename = constructFilename(structureName, index.getPath(), index.getPrefix(), MAPFILE_EXT);
+ final FixedSizeWriteableFactory<Text> keyFactory =
+ (FixedSizeWriteableFactory<Text>)index.getIndexStructure(structureName+"-keyfactory");
+ final FixedSizeWriteableFactory<LexiconEntry> valueFactory =
+ (FixedSizeWriteableFactory<LexiconEntry>)index.getIndexStructure(structureName+"-valuefactory");
+ final int numEntries = MapFile.numberOfEntries(mapFileFilename, keyFactory, valueFactory);
+
+ //term id lookups
+ boolean termIdsAligned = true;
+ int[] termid2index = new int[numEntries];
+ int counter = 0; int lastTermId = -1;
+
+ //bsearch reduction
+ int previousFirstChar = -1;
+ int firstChar = 0;
+ final TIntObjectHashMap<int[]> map = new TIntObjectHashMap<int[]>();
+
+ Iterator<Map.Entry<Text,LexiconEntry>> iterator =
+ new MapFile.EntryIterator<Text,LexiconEntry>(mapFileFilename, keyFactory, valueFactory);
+ while(iterator.hasNext())
+ {
+ Map.Entry<Text,LexiconEntry> lee = iterator.next();
+ //System.err.println(lee.toString());
+
+ //term id
+ if (! (termId == lastTermId+1))
+ termIdsAligned = false;
+ termid2index[termId] = counter;
+ lastTermId = termId;
+
+ //bsearch reduction
+ firstChar = lee.getKey().charAt(0);
+ if (firstChar!=previousFirstChar) {
+ int[] boundaries = new int[] {counter, 0};
+ map.put(firstChar, boundaries);
+ previousFirstChar = firstChar;
+ }
+
+ //increments
+ statsCounter.count(lee.getValue());
+ counter++;
+ }
+ if (iterator instanceof Closeable)
+ ((Closeable)iterator).close();
+
+ //deal with termids
+ if (termIdsAligned)
+ {
+ index.setIndexProperty("index."+structureName+".termids", "aligned");
+ System.err.println("All ids for structure "+structureName+ " are aligned, skipping "
+ +ID_EXT+ " file");
+ }
+ else
+ {
+ DataOutputStream dos = new DataOutputStream(Files.writeFileStream(
+ constructFilename(structureName, index.getPath(), index.getPrefix(), ID_EXT)));
+ for(int indexof : termid2index)
+ dos.writeInt(indexof);
+ dos.close();
+ index.setIndexProperty("index."+structureName+".termids", (numEntries > 15000000) ? "file" : "fileinmem");
+ }
+
+
+ int[] mapKeys = map.keys();
+ Arrays.sort(mapKeys);
+ final int mapKeysSize = mapKeys.length;
+ for (int i=0; i<mapKeysSize-1; i++)
+ {
+ int[] currentBoundaries = map.get(mapKeys[i]);
+ currentBoundaries[1] = map.get(mapKeys[i+1])[0];
+ map.put(mapKeys[i], currentBoundaries);
+ }
+ //the upper boundary of the last first-character range is the lexicon size
+ int[] lastBoundaries = map.get(mapKeys[mapKeysSize-1]);
+ lastBoundaries[1] = numEntries;
+ map.put(mapKeys[mapKeysSize-1], lastBoundaries);
+
+ ObjectOutputStream oos = new ObjectOutputStream(Files.writeFileStream(
+ constructFilename(structureName, index.getPath(), index.getPrefix(), HASH_EXT)));
+ oos.writeObject(map);
+ oos.close();
+ index.setIndexProperty("index."+structureName+".bsearchshortcut", "charmap");
+ index.flush();
+ }
+
+ protected static String constructFilename(String structureName, String path, String prefix, String extension)
+ {
+ return path + "/" + prefix + "." + structureName + extension;
+ }
+}
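optimise() above builds both side structures in one sequential scan: the .mapid file (termid to entry position) only when ids are not aligned, and the .maphash first-character boundary map that CharMapBSearchShortcut later deserialises. A small illustration of what that boundary map contains (the numbers are invented for the example):

    // For each initial character, the [start, end) entry positions in the
    // sorted mapfile. If entries 0..99 start with 'a' and 100..180 with 'b',
    // a lookup for a term beginning with 'b' binary-searches only 100..180.
    TIntObjectHashMap<int[]> bounds = new TIntObjectHashMap<int[]>();
    bounds.put('a', new int[]{0, 100});
    bounds.put('b', new int[]{100, 181});
    int[] window = bounds.get('b'); // the range returned by searchBounds()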
diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/MapFileLexiconOutputStream.java src/uk/ac/gla/terrier/structures/MapFileLexiconOutputStream.java
--- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/MapFileLexiconOutputStream.java 1970-01-01 01:00:00.000000000 +0100
+++ src/uk/ac/gla/terrier/structures/MapFileLexiconOutputStream.java 2009-03-03 14:34:49.000000000 +0000
@@ -0,0 +1,100 @@
+package uk.ac.gla.terrier.structures;
+
+import java.io.IOException;
+
+import org.apache.hadoop.io.Text;
+
+import uk.ac.gla.terrier.structures.maps.MapFile;
+import uk.ac.gla.terrier.structures.seralization.FixedSizeWriteableFactory;
+
+public class MapFileLexiconOutputStream extends LexiconOutputStream<String>
+{
+ protected FixedSizeWriteableFactory<Text> keyFactory;
+ protected Text tempKey = null;
+ protected final MapFile.MapFileWriter mapFileWriter;
+ protected Index index = null;
+ protected String leValueClassname = null;
+ protected final String structureName;
+
+ public MapFileLexiconOutputStream(String filename, FixedSizeWriteableFactory<Text> _keyFactory) throws IOException
+ {
+ mapFileWriter = MapFile.mapFileWrite(filename);
+ structureName = null;
+ leValueClassname = null;
+ index = null;
+ keyFactory = _keyFactory;
+ tempKey = keyFactory.newInstance();
+ }
+
+ @SuppressWarnings("unchecked")
+ public MapFileLexiconOutputStream(String path, String prefix, String _structureName,
+ FixedSizeWriteableFactory<Text> _keyFactory) throws IOException
+ {
+ super();
+ this.structureName = _structureName;
+ mapFileWriter = MapFile.mapFileWrite(path
+ + "/"+ prefix
+ +"." + structureName +".mapfile"); //TODO: could we use MapFileLexicon.constructFilename()
+ keyFactory = _keyFactory;
+ tempKey = keyFactory.newInstance();
+ }
+
+ @SuppressWarnings("unchecked")
+ static FixedSizeWriteableFactory<Text> getKeyFactory(Index _index, String _structureName)
+ {
+ _index.addIndexStructure(_structureName+"-keyfactory",
+ uk.ac.gla.terrier.structures.seralization.FixedSizeTextFactory.class.getName(),
+ "java.lang.String", "${max.term.length}");
+ _index.flush();
+ return (FixedSizeWriteableFactory<Text>)_index.getIndexStructure(_structureName+"-keyfactory");
+ }
+
+ public MapFileLexiconOutputStream(Index _index, String _structureName,
+ Class<? extends FixedSizeWriteableFactory<LexiconEntry>> valueFactoryClass) throws IOException
+ {
+ this(_index.getPath(), _index.getPrefix(), _structureName, getKeyFactory(_index, _structureName));
+ this.index = _index;
+ leValueClassname = valueFactoryClass.getName();
+ }
+
+
+ @Override
+ public int writeNextEntry(String _key, LexiconEntry _value) throws IOException {
+ tempKey.set(_key);
+ mapFileWriter.write(tempKey, _value);
+ super.incrementCounters(_value);
+ return keyFactory.getSize() /* + TODO */;
+ }
+
+ @Override
+ public void close()
+ {
+ try{
+ mapFileWriter.close();
+ } catch (IOException ioe) {}
+
+ if (index != null)
+ {
+ addLexiconToIndex(index, this.structureName, this.leValueClassname);
+ }
+ }
+
+ public static void addLexiconToIndex(Index index, String structureName, String leValueClassname)
+ {
+ index.addIndexStructure(
+ structureName+"-valuefactory",
+ leValueClassname,
+ "", "");
+ index.addIndexStructure(
+ structureName,
+ "uk.ac.gla.terrier.structures.MapFileLexicon",
+ "java.lang.String,uk.ac.gla.terrier.structures.Index",
+ "structureName,index");
+ index.addIndexStructureInputStream(
+ structureName,
+ "uk.ac.gla.terrier.structures.MapFileLexicon$MapFileLexiconIterator",
+ "java.lang.String,uk.ac.gla.terrier.structures.Index",
+ "structureName,index");
+ }
+
+}
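Writer-side, the intended flow appears to be: stream entries in term order, close, then optimise. A sketch under those assumptions (MyLexiconEntryFactory, anEntry and anotherEntry are placeholders, not part of the patch):

    // Entries must be added in lexicographical term order; close() registers
    // the lexicon and its input stream on the index via addLexiconToIndex(),
    // after which MapFileLexicon.optimise() can build the side files.
    MapFileLexiconOutputStream los =
        new MapFileLexiconOutputStream(index, "lexicon", MyLexiconEntryFactory.class);
    los.writeNextEntry("aardvark", anEntry);
    los.writeNextEntry("zebra", anotherEntry);
    los.close();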
diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/MapLexicon.java src/uk/ac/gla/terrier/structures/MapLexicon.java
--- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/MapLexicon.java 1970-01-01 01:00:00.000000000 +0100
+++ src/uk/ac/gla/terrier/structures/MapLexicon.java 2009-03-03 14:34:49.000000000 +0000
@@ -0,0 +1,89 @@
+package uk.ac.gla.terrier.structures;
+import java.io.IOException;
+import java.util.Map;
+
+import org.apache.hadoop.io.Text;
+import uk.ac.gla.terrier.structures.seralization.WriteableFactory;
+
+import uk.ac.gla.terrier.structures.maps.OrderedMap;
+
+public abstract class MapLexicon extends Lexicon implements Closeable
+{
+ protected WriteableFactory<Text> keyFactory = null;
+ protected interface Id2EntryIndexLookup
+ {
+ int getIndex(int termid) throws IOException;
+ }
+
+ protected static class IdIsIndex implements Id2EntryIndexLookup
+ {
+ public int getIndex(int termid)
+ {
+ return termid;
+ }
+ }
+
+ final Map<Text,LexiconEntry> map;
+ Id2EntryIndexLookup idlookup;
+
+ public MapLexicon(Map<Text,LexiconEntry> backingMap)
+ {
+ this.map = backingMap;
+ this.idlookup = new IdIsIndex();
+ }
+
+ public MapLexicon(Map<Text,LexiconEntry> backingMap,
+ Id2EntryIndexLookup idlookupobject)
+ {
+ this.map = backingMap;
+ this.idlookup = idlookupobject;
+ }
+
+ protected void setTermIdLookup(Id2EntryIndexLookup idlookupobject)
+ {
+ this.idlookup = idlookupobject;
+ }
+
+ public LexiconEntry getLexiconEntry(String term)
+ {
+ Text key = keyFactory.newInstance();
+ key.set(term);
+ return map.get(key);
+ }
+
+ public Map.Entry<String,LexiconEntry> getIthLexiconEntry(int index)
+ {
+ if (! (map instanceof OrderedMap))
+ throw new UnsupportedOperationException();
+ return toStringEntry(((OrderedMap<Text,LexiconEntry>)map).get(index));
+ }
+
+ public Map.Entry<String,LexiconEntry> getLexiconEntry(int termid)
+ {
+ int id;
+ try{
+ id = idlookup.getIndex(termid);
+ } catch (IOException ioe) {
+ return null;
+ }
+ return getIthLexiconEntry(id);
+ }
+
+ public int numberOfEntries()
+ {
+ return this.map.size();
+ }
+
+ static Map.Entry<String,LexiconEntry> toStringEntry (Map.Entry<Text,LexiconEntry> a)
+ {
+ return new LexiconFileEntry(a.getKey().toString(), a.getValue());
+ }
+
+ public void close()
+ {
+ if (map instanceof Closeable)
+ ((Closeable)map).close();
+ if (idlookup instanceof Closeable)
+ ((Closeable)idlookup).close();
+ }
+}
diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/PersistentHashtable.java src/uk/ac/gla/terrier/structures/PersistentHashtable.java
--- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/PersistentHashtable.java 2005-01-17 15:16:14.000000000 +0000
+++ src/uk/ac/gla/terrier/structures/PersistentHashtable.java 1970-01-01 01:00:00.000000000 +0100
@@ -1,26 +0,0 @@
-package uk.ac.gla.terrier.structures;
-import java.util.Enumeration;
-/**
- * This interface does not extend Map, which is the parent interface of
- * Hashtable as the JDBM interface has direct string support. This saves
- * the casting to Objects and back.
- */
-public interface PersistentHashtable
-{
- public void clear();
-
- public boolean containsKey(String key);
-
- public boolean equals(Object o);
- public String get(String key);
-
- public int hashCode();
- public boolean isEmpty();
- public Enumeration keys();
- public Enumeration values();
-
- public void put(String key, String value);
- public void remove(String key);
- public int size();
- public void close();
-}
diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/TermStatistics.java src/uk/ac/gla/terrier/structures/TermStatistics.java
--- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/TermStatistics.java 1970-01-01 01:00:00.000000000 +0100
+++ src/uk/ac/gla/terrier/structures/TermStatistics.java 2009-03-03 14:34:49.000000000 +0000
@@ -0,0 +1,10 @@
+package uk.ac.gla.terrier.structures;
+public interface TermStatistics
+{
+ public int getFrequency(); //F
+ public int getDocumentFrequency(); //Nt
+ public int getTermId();
+
+ public void add(TermStatistics e);
+ public void subtract(TermStatistics e);
+}
diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/UTFBlockLexicon.java src/uk/ac/gla/terrier/structures/UTFBlockLexicon.java
--- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/UTFBlockLexicon.java 2009-01-28 20:16:55.000000000 +0000
+++ src/uk/ac/gla/terrier/structures/UTFBlockLexicon.java 1970-01-01 01:00:00.000000000 +0100
@@ -1,316 +0,0 @@
-/*
- * Terrier - Terabyte Retriever
- * Webpage: http://ir.dcs.gla.ac.uk/terrier
- * Contact: terrier{a.}dcs.gla.ac.uk
- * University of Glasgow - Department of Computing Science
- * http://www.gla.ac.uk/
- *
- * The contents of this file are subject to the Mozilla Public License
- * Version 1.1 (the "License"); you may not use this file except in
- * compliance with the License.
You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is UTFBlockLexicon.java. - * - * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. - * All Rights Reserved. - * - * Contributor(s): - * Douglas Johnson (original author) - * Vassilis Plachouras - */ -package uk.ac.gla.terrier.structures; -import java.io.*; -import java.util.Arrays; - -import org.apache.log4j.Logger; - -import uk.ac.gla.terrier.utility.ApplicationSetup; -import uk.ac.gla.terrier.utility.StringTools; -import uk.ac.gla.terrier.utility.io.RandomDataInput; -import uk.ac.gla.terrier.utility.io.RandomDataOutput; -/** - * A lexicon class that saves the number of - * different blocks a term appears in, using UTF encoding of Strings. It is used only during - * creating a utf block inverted index. After the utf block inverted - * index has been created, the utf block lexicon is transformed into - * a utf lexicon. - * @author Douglas Johnson, Vassilis Plachouras - * @version $Revision: 1.16 $ - */ -public class UTFBlockLexicon extends BlockLexicon { - /** The logger used */ - private static Logger logger = Logger.getRootLogger(); - /** - * The size in bytes of an entry in the lexicon file. An entry corresponds - * to a string, an int (termCode), an int (docf), an int (tf), a long (the - * offset of the end of the term's entry in bytes in the inverted file) and - * a byte (the offset in bits of the last byte of the term's entry in the - * inverted file. - */ - public static final int lexiconEntryLength = - 2+//two bytes for length written by writeUTF - ApplicationSetup.STRING_BYTE_LENGTH //the byte representation of the string, ie 3* MAX_TERM_LENGTH - + 16 //the four integers - + 8 //the long - + 1; //the byte - /** - * A default constructor. - */ - public UTFBlockLexicon() { - super(); - - try { - numberOfLexiconEntries = (int) (lexiconFile.length() / (long)lexiconEntryLength); - bufferInput.mark(3 * lexiconEntryLength); - } catch (IOException ioe) { - logger.fatal("Input/output exception while opening for reading the lexicon file. Stack trace follows",ioe); - } - inputStreamClass = UTFLexiconInputStream.class; - } - - public UTFBlockLexicon(String path, String prefix) - { - this(path + ApplicationSetup.FILE_SEPARATOR + prefix + ApplicationSetup.LEXICONSUFFIX); - } - - /** - * Constructs an instace of BlockLexicon and opens the corresponding file. - * @param lexiconName the name of the lexicon file. - */ - public UTFBlockLexicon(String lexiconName) { - super(lexiconName); - try { - numberOfLexiconEntries = (int) (lexiconFile.length() / (long)lexiconEntryLength); - bufferInput.mark(3 * lexiconEntryLength); - } catch (IOException ioe) { - logger.fatal("Input/output exception while opening for reading the " + - "lexicon file. Stack trace follows",ioe); - } - inputStreamClass = UTFLexiconInputStream.class; - } - - - - /** - * Finds the term given its term code. 
- * - * @return true if the term is found, else return false - * @param termId - * the term's id - */ - public boolean findTerm(int termId) { - try { - idToOffsetFile.seek((long)termId * 8L); - long lexiconOffset = idToOffsetFile.readLong(); - if (lexiconOffset == 0) { - startOffset = 0; - startBitOffset = 0; - lexiconFile.seek(lexiconOffset); - - term = lexiconFile.readUTF(); - lexiconFile.readFully(bt, 0, ApplicationSetup.STRING_BYTE_LENGTH - StringTools.utf8_length(term)); - this.termId = lexiconFile.readInt(); - documentFrequency = lexiconFile.readInt(); - blockFrequency = lexiconFile.readInt(); - termFrequency = lexiconFile.readInt(); - endOffset = lexiconFile.readLong(); - endBitOffset = lexiconFile.readByte(); - return true; - } else { - lexiconFile.seek(lexiconOffset - 9); - //goes to the lexicon offset minus the long offset and a byte - startOffset = lexiconFile.readLong(); - startBitOffset = lexiconFile.readByte(); - startBitOffset++; - if (startBitOffset == 8) { - startBitOffset = 0; - startOffset++; - } - term = lexiconFile.readUTF(); - lexiconFile.readFully(bt, 0, ApplicationSetup.STRING_BYTE_LENGTH - StringTools.utf8_length(term)); - - this.termId = lexiconFile.readInt(); - documentFrequency = lexiconFile.readInt(); - blockFrequency = lexiconFile.readInt(); - termFrequency = lexiconFile.readInt(); - endOffset = lexiconFile.readLong(); - endBitOffset = lexiconFile.readByte(); - return true; - } - } catch (IOException ioe) { - logger.fatal("Input/Output exception while reading the idToOffset file. Stack trace follows.",ioe); - } - return false; - } - /** - * Performs a binary search in the lexicon in order to locate the given - * term. If the term is located, the properties termCharacters, - * documentFrequency, termFrequency, startOffset, startBitOffset, endOffset - * and endBitOffset contain the values related to the term. - * - * @param _term the term to search for. - * @return true if the term is found, and false otherwise. - */ - public boolean findTerm(String _term) { - Arrays.fill(buffer, (byte) 0); - Arrays.fill(bt, (byte) 0); - byte[] bt = _term.getBytes(); String tmpTerm = null; - //int termLength = ApplicationSetup.STRING_BYTE_LENGTH; - //int _termId = 0; - long low = -1; - long high = numberOfLexiconEntries; - long i; - while (high-low>1) { - - i = (long)(high+low)/2; - try { - lexiconFile.seek((long)i * (long)UTFBlockLexicon.lexiconEntryLength); - tmpTerm = lexiconFile.readUTF(); - lexiconFile.readFully(bt, 0, ApplicationSetup.STRING_BYTE_LENGTH - StringTools.utf8_length(term)); - } catch (IOException ioe) { - logger.fatal( - "Input/Output exception while reading from lexicon file. Stack trace follows.",ioe); - } - - int compareResult = 0; - compareResult = _term.compareTo(tmpTerm); - - if (compareResult < 1) - high = i; - else - low = i; - } - if (high == numberOfLexiconEntries) - return false; - try { - lexiconFile.seek((long)high * (long)UTFBlockLexicon.lexiconEntryLength); - tmpTerm = lexiconFile.readUTF(); - lexiconFile.readFully(bt, 0, ApplicationSetup.STRING_BYTE_LENGTH - StringTools.utf8_length(term)); - } catch (IOException ioe) { - logger.fatal( - "Input/Output exception while reading from lexicon file. Stack trace follows.",ioe); - } - - if (_term.compareTo(tmpTerm)==0) { - try { - findTerm(lexiconFile.readInt()); - return true; - }catch(IOException ioe) { - logger.fatal("Input/Output exception while reading from lexicon file. 
Stack trace follows.",ioe); - } - } - return false; - } - - /** - * Returns the block frequency for the given term - * @return int The block frequency for the given term - */ - public int getBlockFrequency() { - return blockFrequency; - } - /** - * Seeks the i-th entry of the lexicon. - * @param i - * The index of the entry we are looking for. - * @return true if the entry was found, false otherwise. - */ - public boolean seekEntry(int i) { - try { - if (i > numberOfLexiconEntries) - return false; - if (i == 0) { - lexiconFile.seek((long)i * (long)lexiconEntryLength); - startOffset = 0; - startBitOffset = 0; - term = lexiconFile.readUTF(); - lexiconFile.readFully(bt, 0, ApplicationSetup.STRING_BYTE_LENGTH - StringTools.utf8_length(term)); - - termId = lexiconFile.readInt(); - documentFrequency = lexiconFile.readInt(); - blockFrequency = lexiconFile.readInt(); - termFrequency = lexiconFile.readInt(); - endOffset = lexiconFile.readLong(); - endBitOffset = lexiconFile.readByte(); - return true; - } else { - lexiconFile.seek((long)i * (long)lexiconEntryLength - (long)lexiconEntryLength - + 2L + (long)ApplicationSetup.STRING_BYTE_LENGTH + 12L); - startOffset = lexiconFile.readLong(); - startBitOffset = lexiconFile.readByte(); - startBitOffset++; - if (startBitOffset == 8) { - startBitOffset = 0; - startOffset++; - } - - term = lexiconFile.readUTF(); - lexiconFile.readFully(bt, 0, ApplicationSetup.STRING_BYTE_LENGTH - StringTools.utf8_length(term)); - - termId = lexiconFile.readInt(); - documentFrequency = lexiconFile.readInt(); - blockFrequency = lexiconFile.readInt(); - termFrequency = lexiconFile.readInt(); - endOffset = lexiconFile.readLong(); - endBitOffset = lexiconFile.readByte(); - return true; - } - } catch (IOException ioe) { - logger.fatal("Input/Output exception while reading the idToOffset file. Stack trace follows.",ioe); - } - return false; - } - - /** - * In an already stored entry in the lexicon file, the information about the - * term frequency, the endOffset in bytes, and the endBitOffset in the last - * byte, is updated. The term is specified by the index of the entry. - * - * @return true if the information is updated properly, otherwise return - * false - * @param i the i-th entry - * @param frequency the term's Frequency - * @param endOffset the offset of the ending byte in the inverted file - * @param endBitOffset the offset in bits in the ending byte in the term's entry in - * inverted file - * @deprecated Block Lexicons are used during indexing, but not during - * retrieval. - */ - public boolean updateEntry(int i, int frequency, long endOffset, - byte endBitOffset) { - if (! (lexiconFile instanceof RandomDataOutput)) - return false; - RandomDataOutput _lexiconFile = (RandomDataOutput)lexiconFile; - try { - long lexiconOffset = (long)i * (long)lexiconEntryLength; - //we seek the offset where the frequency should be writen - _lexiconFile.seek(lexiconOffset - + (long)ApplicationSetup.STRING_BYTE_LENGTH + 8L); - _lexiconFile.writeInt(frequency); - _lexiconFile.writeLong(endOffset); - _lexiconFile.writeByte(endBitOffset); - } catch (IOException ioe) { - logger.fatal("Input/Output exception while updating the lexicon file. 
Stack trace follows.",ioe); - } - return false; - } - - /** returns the number of entries in the lexicon named by f */ - public static int numberOfEntries(File f) - { - return (int)(f.length()/ (long)lexiconEntryLength); - } - - /** returns the number of entries in the lexicon named by filename */ - public static int numberOfEntries(String filename) - { - return numberOfEntries(new File(filename)); - } - -} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/UTFBlockLexiconInputStream.java src/uk/ac/gla/terrier/structures/UTFBlockLexiconInputStream.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/UTFBlockLexiconInputStream.java 2009-01-28 20:16:55.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/UTFBlockLexiconInputStream.java 1970-01-01 01:00:00.000000000 +0100 @@ -1,200 +0,0 @@ -/* - * Terrier - Terabyte Retriever - * Webpage: http://ir.dcs.gla.ac.uk/terrier - * Contact: terrier{a.}dcs.gla.ac.uk - * University of Glasgow - Department of Computing Science - * http://www.gla.ac.uk/ - * - * The contents of this file are subject to the Mozilla Public License - * Version 1.1 (the "License"); you may not use this file except in - * compliance with the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is BlockLexiconInputStream.java. - * - * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. - * All Rights Reserved. - * - * Contributor(s): - * Douglas Johnson (original author) - * Vassilis Plachouras - */ -package uk.ac.gla.terrier.structures; -import java.io.DataInput; -import java.io.EOFException; -import java.io.File; -import java.io.IOException; -import java.util.Arrays; - -import org.apache.log4j.Logger; - -import uk.ac.gla.terrier.utility.ApplicationSetup; -import uk.ac.gla.terrier.utility.StringTools; -/** - * An input stream for accessing sequentially the entries - * of a block lexicon. - * @author Douglas Johnson, Vassilis Plachouras - * @version $Revision: 1.17 $ - */ -public class UTFBlockLexiconInputStream extends BlockLexiconInputStream { - /** The logger used */ - private static Logger logger = Logger.getRootLogger(); - /** The term represented as an array of bytes.*/ - protected byte[] termCharacters = new byte[ApplicationSetup.STRING_BYTE_LENGTH +2]; - /** - * A default constructor. - */ - public UTFBlockLexiconInputStream() { - super(); - entrySize = UTFBlockLexicon.lexiconEntryLength; - } - /** - * A constructor given the filename. - * @param filename java.lang.String the name of the lexicon file. - */ - public UTFBlockLexiconInputStream(String filename) { - super(filename); - entrySize = UTFBlockLexicon.lexiconEntryLength; - } - /** - * A constructor given the filename. - * @param file java.io.File the name of the lexicon file. - */ - public UTFBlockLexiconInputStream(File file) { - super(file); - entrySize = UTFBlockLexicon.lexiconEntryLength; - } - - /** Read a lexicon from the specified input stream */ - public UTFBlockLexiconInputStream(DataInput in) { - super(in); - entrySize = UTFBlockLexicon.lexiconEntryLength; - } - /** - * Read the next lexicon entry. 
- * @return the number of bytes read if there is no error, - * otherwise returns -1 in case of EOF - * @throws java.io.IOException if an I/O error occurs - */ - public int readNextEntry() throws IOException { - try { - startBitOffset = (byte) (endBitOffset + 1); - startOffset = endOffset; - if (startBitOffset == 8) { - startOffset = endOffset + 1; - startBitOffset = 0; - } - - term = lexiconStream.readUTF(); - lexiconStream.skipBytes(ApplicationSetup.STRING_BYTE_LENGTH - StringTools.utf8_length(term)); - - termId = lexiconStream.readInt(); - documentFrequency = lexiconStream.readInt(); - blockFrequency = lexiconStream.readInt(); - termFrequency = lexiconStream.readInt(); - endOffset = lexiconStream.readLong(); - endBitOffset = lexiconStream.readByte(); - numPointersRead += documentFrequency; - numTokensRead += termFrequency; - numTermsRead++; - return Lexicon.lexiconEntryLength; - } catch (EOFException eofe) { - return -1; - } - } - - /** - * Returns the number of entries in the lexicon file. - */ - public int numberOfEntries(){ - return (int)(lexiconFilelength / UTFBlockLexicon.lexiconEntryLength); - } - - /** - * Read the next lexicon entry, where the term is saved as a byte array. No attempt is - * made to parse the byte array and the padding bytes into a String. Use this method when - * you want to get the bytes of the string using getTermCharacters(). This method does - * NOT work with getTerm() - * @return the number of bytes read if there is no error, - * otherwise returns -1 in case of EOF - * @throws java.io.IOException if an I/O error occurs - */ - public int readNextEntryBytes() throws IOException { - try { - startBitOffset = (byte) (endBitOffset + 1); - startOffset = endOffset; - if (startBitOffset == 8) { - startOffset = endOffset + 1; - startBitOffset = 0; - } - - Arrays.fill(termCharacters, (byte)0); - lexiconStream.readFully(termCharacters, 0, ApplicationSetup.STRING_BYTE_LENGTH +2); - - termId = lexiconStream.readInt(); - documentFrequency = lexiconStream.readInt(); - blockFrequency = lexiconStream.readInt(); - termFrequency = lexiconStream.readInt(); - endOffset = lexiconStream.readLong(); - endBitOffset = lexiconStream.readByte(); - numPointersRead += documentFrequency; - numTokensRead += termFrequency; - numTermsRead++; - return Lexicon.lexiconEntryLength; - } catch (EOFException eofe) { - return -1; - } - } - - /** - * Prints out the contents of the lexicon file to check. - */ - public void print() { - int i = 0; //counter - int entryLength = Lexicon.lexiconEntryLength; - try { - while (readNextEntry() != -1) { - System.out.println( - "" - + (long)i * (long)entryLength - + ", " - + term.trim() - + ", " - + termId - + ", " - + documentFrequency - + ", " - + blockFrequency - + ", " - + termFrequency - + ", " - + endBitOffset); - i++; - } - } catch (IOException ioe) { - logger.error("Input/Output exception while reading the lexicon index input stream. ", ioe); - } - } - - /** - * Returns the string representation of the term. - * @return the string representation of the already found term. - */ - public String getTerm() - { - return term; - } - - /** - * Returns the bytes of the String. Only valid is readNextEntryByte was used. 
- * @return the byte array holding the term's byte representation - */ - public byte[] getTermCharacters() { - return termCharacters; - } -} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/UTFBlockLexiconOutputStream.java src/uk/ac/gla/terrier/structures/UTFBlockLexiconOutputStream.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/UTFBlockLexiconOutputStream.java 2009-01-28 20:16:55.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/UTFBlockLexiconOutputStream.java 1970-01-01 01:00:00.000000000 +0100 @@ -1,153 +0,0 @@ -/* - * Terrier - Terabyte Retriever - * Webpage: http://ir.dcs.gla.ac.uk/terrier - * Contact: terrier{a.}dcs.gla.ac.uk - * University of Glasgow - Department of Computing Science - * http://www.gla.ac.uk/ - * - * The contents of this file are subject to the Mozilla Public License - * Version 1.1 (the "License"); you may not use this file except in - * compliance with the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is BlockLexiconOutputStream.java. - * - * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. - * All Rights Reserved. - * - * Contributor(s): - * Douglas Johnson (original author) - * Vassilis Plachouras - */ -package uk.ac.gla.terrier.structures; -import java.io.DataOutput; -import java.io.File; -import java.io.IOException; - -import uk.ac.gla.terrier.utility.ApplicationSetup; -import uk.ac.gla.terrier.utility.StringTools; -/** - * An output stream for writing the lexicon to a file sequentially. - * @author Douglas Johnson, Vassilis Plachouras - * @version $Revision: 1.10 $ - */ -public class UTFBlockLexiconOutputStream extends BlockLexiconOutputStream { - /** A zero buffer for writing to the file.*/ - private static byte[] zeroBuffer = - new byte[ApplicationSetup.STRING_BYTE_LENGTH]; - /** - * The number of different blocks in which a term appears. - * This is used only during the creation of the inverted - * file and it can be ignored afterwards. - */ - protected int blockFrequency; - /** - * A default constructor. - */ - public UTFBlockLexiconOutputStream() { - super(); - } - /** - * A constructor given the filename. - * @param filename java.lang.String the name of the lexicon file. - */ - public UTFBlockLexiconOutputStream(String filename) { - super(filename); - } - /** - * A constructor given the file. - * @param file java.io.File the lexicon file. - */ - public UTFBlockLexiconOutputStream(File file) { - super(file); - } - - /** Create a lexicon using the specified data stream */ - public UTFBlockLexiconOutputStream(DataOutput out){ - super(out); - } - /** - * Write a lexicon entry. 
- * @return the number of bytes written if there is no error, otherwise returns -1 in case of EOF - * @throws IOException if an I/O error occurs - * @param term the string representation of the term - * @param termId the terms integer identifier - * @param documentFrequency the term's document frequency in the collection - * @param termFrequency the term's frequency in the collection - * @param endOffset the term's ending byte offset in the inverted file - * @param endBitOffset the term's ending byte bit-offset in the inverted file - */ - public int writeNextEntry( - String term, - int termId, - int documentFrequency, - int termFrequency, - int blockFrequency, - long endOffset, - byte endBitOffset) - throws IOException { - numPointersWritten += documentFrequency; - numTokensWritten += termFrequency; - numTermsWritten++; - lexiconStream.writeUTF(term); - lexiconStream.write( - zeroBuffer, - 0, - ApplicationSetup.STRING_BYTE_LENGTH - StringTools.utf8_length(term)); - lexiconStream.writeInt(termId); - lexiconStream.writeInt(documentFrequency); - lexiconStream.writeInt(blockFrequency); - lexiconStream.writeInt(termFrequency); - lexiconStream.writeLong(endOffset); - lexiconStream.writeByte(endBitOffset); - return Lexicon.lexiconEntryLength; - } - /** - * Write a lexicon entry. - * @return the number of bytes written if there is no error, otherwise returns -1 in case of EOF - * @throws java.io.IOException if an I/O error occurs - * @param term the byte array representation of the term - * @param termId the terms integer identifier - * @param documentFrequency the term's document frequency in the collection - * @param termFrequency the term's frequency in the collection - * @param endOffset the term's ending byte offset in the inverted file - * @param endBitOffset the term's ending byte bit-offset in the inverted file - */ - public int writeNextEntry( - byte[] term, - int termId, - int documentFrequency, - int termFrequency, - int blockFrequency, - long endOffset, - byte endBitOffset) - throws IOException { - final int length = term.length; - numPointersWritten += documentFrequency; - numTokensWritten += termFrequency; - lexiconStream.write(term, 0, length); - lexiconStream.write( - zeroBuffer, - 0, - 2+ApplicationSetup.STRING_BYTE_LENGTH - length); - lexiconStream.writeInt(termId); - lexiconStream.writeInt(documentFrequency); - lexiconStream.writeInt(blockFrequency); - lexiconStream.writeInt(termFrequency); - lexiconStream.writeLong(endOffset); - lexiconStream.writeByte(endBitOffset); - return Lexicon.lexiconEntryLength; - } - /** - * Sets the block frequency for the given term - * @param blockFrequency The new block frequency - */ - public void setBF(int blockFrequency) { - this.blockFrequency = blockFrequency; - } -} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/UTFLexicon.java src/uk/ac/gla/terrier/structures/UTFLexicon.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/UTFLexicon.java 2009-01-28 20:16:55.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/UTFLexicon.java 1970-01-01 01:00:00.000000000 +0100 @@ -1,479 +0,0 @@ -/* - * Terrier - Terabyte Retriever - * Webpage: http://ir.dcs.gla.ac.uk/terrier - * Contact: terrier{a.}dcs.gla.ac.uk - * University of Glasgow - Department of Computing Science - * http://www.gla.ac.uk/ - * - * The contents of this file are subject to the Mozilla Public License - * Version 1.1 (the "License"); you may not use this file except in - * 
compliance with the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is Lexicon.java. - * - * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. - * All Rights Reserved. - * - * Contributor(s): - * Gianni Amati (original author) - * Vassilis Plachouras - * Craig Macdonald - */ -package uk.ac.gla.terrier.structures; -import java.io.File; -import java.io.IOException; - -import uk.ac.gla.terrier.utility.ApplicationSetup; -import uk.ac.gla.terrier.utility.StringTools; -import uk.ac.gla.terrier.utility.io.RandomDataOutput; -import org.apache.log4j.Logger; - -/** - * The class that implements the lexicon structure. Apart from the lexicon file, - * which contains the actual data about the terms, and takes its name from - * ApplicationSetup.LEXICON_FILENAME, another file is created and - * used, containing a mapping from the term's code to the offset of the term - * in the lexicon. The name of this file is given by - * ApplicationSetup.LEXICON_INDEX_FILENAME. - * - * @see ApplicationSetup#LEXICON_FILENAME - * @see ApplicationSetup#LEXICON_INDEX_FILENAME - * @author Gianni Amati, Vassilis Plachouras, Craig Macdonald - * @version $Revision: 1.17 $ - */ -public class UTFLexicon extends Lexicon { - /** The logger used */ - private static Logger logger = Logger.getRootLogger(); - /** The term represented as an array of bytes.*/ - protected byte[] termCharacters; - - /** - * The size in bytes of an entry in the lexicon file. - * An entry corresponds to a string, an int (termCode), - * an int (docf), an int (tf), a long (the offset of the end - * of the term's entry in bytes in the inverted file) and - * a byte (the offset in bits of the last byte of the term's entry - * in the inverted file. - */ - public static final int lexiconEntryLength = - - 2+ //two bytes for length written by writeUTF - ApplicationSetup.STRING_BYTE_LENGTH //the byte representation of the string, ie 3* MAX_TERM_LENGTH - - +12 //the three integers - +8 //the long - +1; //the byte - - /** - * A default constructor. - */ - public UTFLexicon() { - super(); - try { - numberOfLexiconEntries = (int) (lexiconFile.length() / (long)UTFLexicon.lexiconEntryLength); - bufferInput.mark(3 * lexiconEntryLength); - } catch (IOException ioe) { - logger.fatal( - "Input/output exception while opening for reading the lexicon file." + - " Stack trace follows",ioe); - } - inputStreamClass = UTFLexiconInputStream.class; - } - public UTFLexicon(String path, String prefix) - { - this(path + ApplicationSetup.FILE_SEPARATOR + prefix + ApplicationSetup.LEXICONSUFFIX); - } - - /** - * Constructs an instace of Lexicon and opens - * the corresponding file. - * - * @param lexiconName the name of the lexicon file. - */ - public UTFLexicon(String lexiconName) { - super(lexiconName); - try { - numberOfLexiconEntries = (int) (lexiconFile.length() / (long)UTFLexicon.lexiconEntryLength); - bufferInput.mark(3 * lexiconEntryLength); - } catch (IOException ioe) { - logger.fatal( - "Input/output exception while opening for reading the lexicon file. Stack trace follows",ioe); - } - inputStreamClass = UTFLexiconInputStream.class; - } - - - /** - * Finds the term given its term code. 
- * - * @return true if the term is found, else return false - * @param _termId the term's identifier - */ - public boolean findTerm(int _termId) { - try { - idToOffsetFile.seek((long)_termId * 8L); - long lexiconOffset = idToOffsetFile.readLong(); - if (lexiconOffset == 0) { - startOffset = 0; - startBitOffset = 0; - lexiconFile.seek(lexiconOffset); - - term = lexiconFile.readUTF(); - lexiconFile.readFully(bt, 0, ApplicationSetup.STRING_BYTE_LENGTH - StringTools.utf8_length(term)); - - termId = lexiconFile.readInt(); - documentFrequency = lexiconFile.readInt(); - termFrequency = lexiconFile.readInt(); - endOffset = lexiconFile.readLong(); - endBitOffset = lexiconFile.readByte(); - return true; - } else { - lexiconFile.seek(lexiconOffset - 9); - //goes to the lexicon offset minus the long offset and a byte - startOffset = lexiconFile.readLong(); - startBitOffset = lexiconFile.readByte(); - startBitOffset++; - if (startBitOffset == 8) { - startBitOffset = 0; - startOffset++; - } - term = lexiconFile.readUTF(); - lexiconFile.readFully(bt, 0, ApplicationSetup.STRING_BYTE_LENGTH - StringTools.utf8_length(term)); - - termId = lexiconFile.readInt(); - documentFrequency = lexiconFile.readInt(); - termFrequency = lexiconFile.readInt(); - endOffset = lexiconFile.readLong(); - endBitOffset = lexiconFile.readByte(); - return true; - } - } catch (IOException ioe) { - logger.fatal( - "Input/Output exception while reading the idToOffset file. Stack trace follows.",ioe); - } - return false; - } - /** - * Performs a binary search in the lexicon - * in order to locate the given term. - * If the term is located, the properties - * termCharacters, documentFrequency, - * termFrequency, startOffset, startBitOffset, - * endOffset and endBitOffset contain the - * values related to the term. - * @param _term The term to search for. - * @return true if the term is found, and false otherwise. - */ - public boolean findTerm(String _term) { - byte[] bt = new byte[ApplicationSetup.STRING_BYTE_LENGTH]; - - //int termLength = ApplicationSetup.STRING_BYTE_LENGTH; - //int _termId = 0; - long low = -1; - long high = numberOfLexiconEntries; - long i; - String currentTerm = null; - while (high-low>1) { - - i = (long)(high+low)/2; - try { - lexiconFile.seek((long)i * (long)lexiconEntryLength); - currentTerm = lexiconFile.readUTF(); - //we don't need to take in the padding as we're seeking between entries - } catch (IOException ioe) { - logger.fatal( - "Input/Output exception while reading from lexicon file. Stack trace follows.",ioe); - } - - if (_term.compareTo(currentTerm) < 1) - high = i; - else - low = i; - } - if (high == numberOfLexiconEntries) - return false; - try { - lexiconFile.seek((long)high * (long)lexiconEntryLength); - currentTerm = lexiconFile.readUTF(); - } catch (IOException ioe) { - logger.fatal( - "Input/Output exception while reading from lexicon file. Stack trace follows.",ioe); - } - - if (_term.compareTo(currentTerm) == 0) { - try { - lexiconFile.readFully(bt, 0, ApplicationSetup.STRING_BYTE_LENGTH- StringTools.utf8_length(currentTerm)); - findTerm(lexiconFile.readInt()); - return true; - }catch(IOException ioe) { - logger.fatal("Input/Output exception while reading from lexicon file. Stack trace follows.",ioe); - } - } - return false; - } - - /** - * Seeks the i-th entry of the lexicon. - * TODO read a byte array from the file and decode it, - * instead of reading the different pieces of - * information separately. - * @param i The index of the entry we are looking for. 
- * @return true if the entry was found, false otherwise. - */ - public boolean seekEntry(int i) { - try { - if (i > numberOfLexiconEntries) - return false; - if (i == 0) { - lexiconFile.seek((long)i * (long)lexiconEntryLength); - startOffset = 0; - startBitOffset = 0; - - term = lexiconFile.readUTF(); - lexiconFile.readFully(bt, 0, ApplicationSetup.STRING_BYTE_LENGTH - StringTools.utf8_length(term)); - - termId = lexiconFile.readInt(); - documentFrequency = lexiconFile.readInt(); - termFrequency = lexiconFile.readInt(); - endOffset = lexiconFile.readLong(); - endBitOffset = lexiconFile.readByte(); - return true; - } else { - lexiconFile.seek( - (long)i * (long)lexiconEntryLength - - (long)lexiconEntryLength - + 2L//two bytes for the string length written by writeUTF - + (long)ApplicationSetup.STRING_BYTE_LENGTH - + 12L); - startOffset = lexiconFile.readLong(); - startBitOffset = lexiconFile.readByte(); - startBitOffset++; - if (startBitOffset == 8) { - startBitOffset = 0; - startOffset++; - } - - term = lexiconFile.readUTF(); - lexiconFile.readFully(bt, 0, ApplicationSetup.STRING_BYTE_LENGTH - StringTools.utf8_length(term)); - - termId = lexiconFile.readInt(); - documentFrequency = lexiconFile.readInt(); - termFrequency = lexiconFile.readInt(); - endOffset = lexiconFile.readLong(); - endBitOffset = lexiconFile.readByte(); - return true; - } - } catch (IOException ioe) { - logger.fatal( - "Input/Output exception while reading the idToOffset file. " + - "Stack trace follows.",ioe); - } - return false; - } - - /** Returns a LexiconEntry describing all the information in the lexicon about the term - * denoted by termid - * @param termid the termid of the term of interest - * @return LexiconEntry all information about the term's entry in the lexicon. null if termid not found */ - public LexiconEntry getLexiconEntry(int termid) { - /* TODO: improve this to the effectiveness level of getLexiconEntry() */ - if (! findTerm(termid)) - return null; - LexiconEntry le = new LexiconEntry(); - le.termId = this.termId; - le.term = this.term.trim(); - le.TF = this.termFrequency; - le.n_t = this.documentFrequency; - le.startOffset = this.startOffset; - le.startBitOffset = this.startBitOffset; - le.endOffset = this.endOffset; - le.endBitOffset = this.endBitOffset; - return le; - } - - /** Returns a LexiconEntry describing all the information in the lexicon about the term - * denoted by _term - * @param _term the String term that is of interest - * @return LexiconEntry all information about the term's entry in the lexicon. 
null if termid not found */ - public LexiconEntry getLexiconEntry(String _term) { - int low = -1; - int high = (int)numberOfLexiconEntries; - int i; - int compareStrings; - String term; - byte[] buffer = new byte[lexiconEntryLength+9]; //to get the start offsets as well - - if (USE_HASH) { - int firstChar = _term.charAt(0); - int[] boundaries = (int[])map.get(firstChar); - if (boundaries != null) - { - low = boundaries[0]; - high = boundaries[1]; - } - //System.out.println("lexicon use hash: " + low + " " + high); - } - - try { - while (high-low>1) { - - i = (high + low)/2; - if (i==0) { - lexiconFile.seek(0); - lexiconFile.readFully(buffer, 0, lexiconEntryLength); - term = lexiconFile.readUTF(); - //new String(buffer,0,ApplicationSetup.STRING_BYTE_LENGTH).trim(); - } else { - lexiconFile.seek((long)i * (long)lexiconEntryLength); - term = lexiconFile.readUTF(); - //term = new String(buffer,9,ApplicationSetup.STRING_BYTE_LENGTH).trim(); - } - - if ((compareStrings = _term.compareTo(term))< 0) - high = i; - else if (compareStrings > 0) - low = i; - else { //read the rest and return the data - if (i==0) - { - lexiconFile.seek(0); - lexiconFile.readFully(buffer, 0, lexiconEntryLength); - } - else - { - lexiconFile.seek((long)i * (long)(lexiconEntryLength) -9); - lexiconFile.readFully(buffer, 0, lexiconEntryLength+9); - } - return getLexiconEntryFromBuffer(buffer, term, i); - } - } - - if (high == numberOfLexiconEntries) - return null; - - if (high == 0) { - lexiconFile.seek(0); - term = lexiconFile.readUTF(); - lexiconFile.seek(0); - lexiconFile.readFully(buffer, 0, lexiconEntryLength); - } else { - lexiconFile.seek((long)high * (long)lexiconEntryLength); - term = lexiconFile.readUTF(); - lexiconFile.seek((long)high * (long)(lexiconEntryLength) -9); - lexiconFile.readFully(buffer, 0, lexiconEntryLength+9); - } - - if (_term.compareTo(term) == 0) { - return getLexiconEntryFromBuffer(buffer, term, high); - } - } catch(IOException ioe) { - logger.fatal("IOException while binary searching the lexicon: " , ioe); - } - return null; - } - - protected LexiconEntry getLexiconEntryFromBuffer(byte[] buffer, String term, int index) { - int offset; - LexiconEntry lEntry = new LexiconEntry(); - lEntry.term = term; - if (index==0) { - lEntry.startOffset = 0; - lEntry.startBitOffset = 0; - offset = ApplicationSetup.STRING_BYTE_LENGTH+2; - } else { - offset = 0; -// lEntry.startOffset = -// (((((((buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | -// buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff; - - long startOffset = (buffer[offset++] & 0xff); - for (int j=0; j<7; j++) - startOffset = startOffset<<8 | (buffer[offset++] & 0xff); - lEntry.startOffset = startOffset; - - - lEntry.startBitOffset = (byte)(buffer[offset++]&0xff); - if (++lEntry.startBitOffset == 8) { - lEntry.startBitOffset = 0; - lEntry.startOffset++; - } - - offset += 2+ApplicationSetup.STRING_BYTE_LENGTH; - } - lEntry.termId = - (((buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff; - lEntry.n_t = - (((buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff; - lEntry.TF = - (((buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff; - -// lEntry.endOffset = -// (((((((buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | 
buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | -// buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff; - - long endOffset = (int)(buffer[offset++] & 0xff); - for (int j=0; j<7; j++) - endOffset = endOffset<<8 | (buffer[offset++] & 0xff); - lEntry.endOffset = endOffset; - lEntry.endBitOffset = (byte)(buffer[offset]&0xff); - return lEntry; - } - - - /** - * In an already stored entry in the lexicon - * file, the information about the term frequency, - * the endOffset in bytes, and the endBitOffset in the last - * byte, is updated. The term is specified by the index of the entry. - * - * @return true if the information is updated properly, - * otherwise return false - * @param i the i-th entry - * @param frequency the term's Frequency - * @param endOffset the offset of the ending byte in the inverted file - * @param endBitOffset the offset in bits in the ending byte - * in the term's entry in inverted file - * @deprecated The Lexicon class is only used for reading the - * lexicon file, and not for writing any information. - */ - public boolean updateEntry( - int i, - int frequency, - long endOffset, - byte endBitOffset) { - if (! (lexiconFile instanceof RandomDataOutput)) - return false; - RandomDataOutput _lexiconFile = (RandomDataOutput)lexiconFile; - try { - long lexiconOffset = (long)i * (long)lexiconEntryLength; - //we seek the offset where the frequency should be writen - _lexiconFile.seek( //utf length, string max length, termid, tf - lexiconOffset + 2+ ApplicationSetup.STRING_BYTE_LENGTH + 8); - _lexiconFile.writeInt(frequency); - _lexiconFile.writeLong(endOffset); - _lexiconFile.writeByte(endBitOffset); - } catch (IOException ioe) { - logger.fatal( - "Input/Output exception while updating the lexicon file. " + - "Stack trace follows.",ioe); - } - return false; - } - - public static int numberOfEntries(File f) - { - return (int)(f.length()/ (long)lexiconEntryLength); - } - - public static int numberOfEntries(String filename) - { - return numberOfEntries(new File(filename)); - } - -} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/UTFLexiconInputStream.java src/uk/ac/gla/terrier/structures/UTFLexiconInputStream.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/UTFLexiconInputStream.java 2009-01-28 20:16:55.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/UTFLexiconInputStream.java 1970-01-01 01:00:00.000000000 +0100 @@ -1,173 +0,0 @@ -/* - * Terrier - Terabyte Retriever - * Webpage: http://ir.dcs.gla.ac.uk/terrier - * Contact: terrier{a.}dcs.gla.ac.uk - * University of Glasgow - Department of Computing Science - * http://www.gla.ac.uk/ - * - * The contents of this file are subject to the Mozilla Public License - * Version 1.1 (the "License"); you may not use this file except in - * compliance with the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is LexiconInputStream.java. - * - * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. - * All Rights Reserved. 
- * - * Contributor(s): - * Vassilis Plachouras (original author) - * Craig Macdonald - */ -package uk.ac.gla.terrier.structures; -import java.io.DataInput; -import java.io.EOFException; -import java.io.File; -import java.io.IOException; -import java.util.Arrays; - -import uk.ac.gla.terrier.utility.ApplicationSetup; -import uk.ac.gla.terrier.utility.StringTools; -/** - * This class implements an input stream for the lexicon structure. - * @author Vassilis Plachouras, Craig Macdonald - * @version $Revision: 1.16 $ - */ -public class UTFLexiconInputStream extends LexiconInputStream { - /** A zero buffer for writing to the file.*/ - protected byte[] junkBuffer = new byte[ApplicationSetup.STRING_BYTE_LENGTH+2]; - - /** - * A default constructor. - */ - public UTFLexiconInputStream() { - super(); - entrySize = UTFLexicon.lexiconEntryLength; - termCharacters = new byte[ApplicationSetup.STRING_BYTE_LENGTH +2]; - } - /** - * A constructor given the filename. - * @param filename java.lang.String the name of the lexicon file. - */ - public UTFLexiconInputStream(String filename) { - super(filename); - entrySize = UTFLexicon.lexiconEntryLength; - termCharacters = new byte[ApplicationSetup.STRING_BYTE_LENGTH +2]; - } - /** - * A constructor given the filename. - * @param file java.io.File the name of the lexicon file. - */ - public UTFLexiconInputStream(File file) { - super(file); - entrySize = UTFLexicon.lexiconEntryLength; - termCharacters = new byte[ApplicationSetup.STRING_BYTE_LENGTH +2]; - } - - public UTFLexiconInputStream(String path, String prefix) { - this(path + ApplicationSetup.FILE_SEPARATOR + prefix + ApplicationSetup.LEXICONSUFFIX); - } - - /** Read a lexicon from the specified input stream */ - public UTFLexiconInputStream(DataInput in) { - super(in); - entrySize = UTFLexicon.lexiconEntryLength; - } - - /** - * Read the next lexicon entry, where the term is parsed as a string. - * This method does NOT work with getTermCharacters() - use readNextEntryBytes() - * iterator for that. - * @return the number of bytes read if there is no error, - * otherwise returns -1 in case of EOF - * @throws java.io.IOException if an I/O error occurs - */ - public int readNextEntry() throws IOException { - try { - startBitOffset = (byte) (endBitOffset + 1); - startOffset = endOffset; - if (startBitOffset == 8) { - startOffset = endOffset + 1; - startBitOffset = 0; - } - - term = lexiconStream.readUTF(); - lexiconStream.readFully(junkBuffer, 0, ApplicationSetup.STRING_BYTE_LENGTH - StringTools.utf8_length(term)); - - termId = lexiconStream.readInt(); - documentFrequency = lexiconStream.readInt(); - termFrequency = lexiconStream.readInt(); - endOffset = lexiconStream.readLong(); - endBitOffset = lexiconStream.readByte(); - numPointersRead += documentFrequency; - numTokensRead += termFrequency; - numTermsRead++; - return Lexicon.lexiconEntryLength; - } catch (EOFException eofe) { - return -1; - } - } - - /** - * Read the next lexicon entry, where the term is saved as a byte array. No attempt is - * made to parse the byte array and the padding bytes into a String. Use this method when - * you want to get the bytes of the string using getTermCharacters(). 
This method does - * NOT work with getTerm() - * @return the number of bytes read if there is no error, - * otherwise returns -1 in case of EOF - * @throws java.io.IOException if an I/O error occurs - */ - public int readNextEntryBytes() throws IOException { - try { - startBitOffset = (byte) (endBitOffset + 1); - startOffset = endOffset; - if (startBitOffset == 8) { - startOffset = endOffset + 1; - startBitOffset = 0; - } - - Arrays.fill(termCharacters, (byte)0); - lexiconStream.readFully(termCharacters, 0, ApplicationSetup.STRING_BYTE_LENGTH +2); - - termId = lexiconStream.readInt(); - documentFrequency = lexiconStream.readInt(); - termFrequency = lexiconStream.readInt(); - endOffset = lexiconStream.readLong(); - endBitOffset = lexiconStream.readByte(); - numPointersRead += documentFrequency; - numTokensRead += termFrequency; - numTermsRead++; - return Lexicon.lexiconEntryLength; - } catch (EOFException eofe) { - return -1; - } - } - /** - * Returns the number of entries in the lexicon file. - */ - public int numberOfEntries(){ - return (int)(lexiconFilelength / UTFLexicon.lexiconEntryLength); - } - - /** - * Returns the string representation of the term. - * @return the string representation of the already found term. - */ - public String getTerm() { - return term; - } - - /** - * Returns the bytes of the String. Only valid is readNextEntryByte was used. - * @return the byte array holding the term's byte representation - */ - public byte[] getTermCharacters() { - return termCharacters; - } - -} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/UTFLexiconOutputStream.java src/uk/ac/gla/terrier/structures/UTFLexiconOutputStream.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/UTFLexiconOutputStream.java 2009-01-28 20:16:57.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/UTFLexiconOutputStream.java 1970-01-01 01:00:00.000000000 +0100 @@ -1,146 +0,0 @@ -/* - * Terrier - Terabyte Retriever - * Webpage: http://ir.dcs.gla.ac.uk/terrier - * Contact: terrier{a.}dcs.gla.ac.uk - * University of Glasgow - Department of Computing Science - * http://www.gla.ac.uk/ - * - * The contents of this file are subject to the Mozilla Public License - * Version 1.1 (the "License"); you may not use this file except in - * compliance with the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is LexiconOutputStream.java. - * - * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. - * All Rights Reserved. - * - * Contributor(s): - * Vassilis Plachouras (original author) - * Craig Macdonald - */ -package uk.ac.gla.terrier.structures; -import java.io.DataOutput; -import java.io.File; -import java.io.IOException; - -import uk.ac.gla.terrier.utility.ApplicationSetup; -import uk.ac.gla.terrier.utility.StringTools; -/** - * This class implements an output stream for the lexicon structure. 
- * @author Vassilis Plachouras, Craig Macdonald - * @version $Revision: 1.12 $ - */ -public class UTFLexiconOutputStream extends LexiconOutputStream { - /** A zero buffer for writing to the file.*/ - private byte[] zeroBuffer = new byte[ApplicationSetup.STRING_BYTE_LENGTH]; - - /** - * A default constructor. - */ - public UTFLexiconOutputStream() { - super(); - } - /** - * A constructor given the filename. - * @param filename java.lang.String the name of the lexicon file. - */ - public UTFLexiconOutputStream(String filename) { - super(filename); - } - /** - * A constructor given the filename. - * @param file java.io.File the name of the lexicon file. - */ - public UTFLexiconOutputStream(File file) { - super(file); - } - - public UTFLexiconOutputStream(String path, String prefix) - { - super(path, prefix); - } - - /** Create a lexicon using the specified data stream */ - public UTFLexiconOutputStream(DataOutput out){ - super(out); - } - - /** - * Writes a lexicon entry. - * @return the number of bytes written to the file. - * @throws java.io.IOException if an I/O error occurs - * @param _term the string representation of the term - * @param _termId the terms integer identifier - * @param _documentFrequency the term's document frequency in the collection - * @param _termFrequency the term's frequency in the collection - * @param _endOffset the term's ending byte offset in the inverted file - * @param _endBitOffset the term's ending byte bit-offset in the inverted file - */ - public int writeNextEntry( - String _term, - int _termId, - int _documentFrequency, - int _termFrequency, - long _endOffset, - byte _endBitOffset) - throws IOException { - numPointersWritten += _documentFrequency; - numTokensWritten += _termFrequency; - numTermsWritten++; - lexiconStream.writeUTF(_term); - lexiconStream.write( - zeroBuffer, - 0, - ApplicationSetup.STRING_BYTE_LENGTH - StringTools.utf8_length(_term)); - lexiconStream.writeInt(_termId); - lexiconStream.writeInt(_documentFrequency); - lexiconStream.writeInt(_termFrequency); - lexiconStream.writeLong(_endOffset); - lexiconStream.writeByte(_endBitOffset); - return UTFLexicon.lexiconEntryLength; - } - /** - * Writes a lexicon entry. - * @return the number of bytes written. - * @throws java.io.IOException if an I/O error occurs - * @param _term the byte representation of the term, as written by DataInput.writeUTF(). 
This - * should be ApplicationSetup.STRING_BYTE_LENGTH +2 in length - * @param _termId the terms integer identifier - * @param _documentFrequency the term's document frequency in the collection - * @param _termFrequency the term's frequency in the collection - * @param _endOffset the term's ending byte offset in the inverted file - * @param _endBitOffset the term's ending byte bit-offset in the inverted file - */ - - public int writeNextEntry( - byte[] _term, - int _termId, - int _documentFrequency, - int _termFrequency, - long _endOffset, - byte _endBitOffset) - throws IOException { - final int length = _term.length; - numPointersWritten += _documentFrequency; - numTokensWritten += _termFrequency; - numTermsWritten++; - lexiconStream.write(_term, 0, length); - lexiconStream.write( - zeroBuffer, - 0, - 2+ApplicationSetup.STRING_BYTE_LENGTH - length); - lexiconStream.writeInt(_termId); - lexiconStream.writeInt(_documentFrequency); - lexiconStream.writeInt(_termFrequency); - lexiconStream.writeLong(_endOffset); - lexiconStream.writeByte(_endBitOffset); - return UTFLexicon.lexiconEntryLength; - } - -} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/BlockDirectIndexBuilder.java src/uk/ac/gla/terrier/structures/indexing/BlockDirectIndexBuilder.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/BlockDirectIndexBuilder.java 2009-01-28 20:16:57.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/indexing/BlockDirectIndexBuilder.java 2009-03-03 14:34:49.000000000 +0000 @@ -87,13 +87,6 @@ /* find out where we are */ FilePosition rtr = getLastEndOffset(); - /* flush to disk if necessary */ - if (DocumentsSinceFlush++ >= DocumentsPerFlush) - { - flushBuffer(); - resetBuffer(); - DocumentsSinceFlush = 0; - } /* and then return where the position of the last * write to the DirectIndex */ return rtr; @@ -106,8 +99,6 @@ */ public void finishedCollections() { - flushBuffer(); - resetBuffer(); DocumentsSinceFlush = 0; logger.info("flush direct index"); try{ diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/BlockInvertedIndexBuilder.java src/uk/ac/gla/terrier/structures/indexing/BlockInvertedIndexBuilder.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/BlockInvertedIndexBuilder.java 2009-01-28 20:16:57.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/indexing/BlockInvertedIndexBuilder.java 2009-03-03 14:34:49.000000000 +0000 @@ -34,15 +34,18 @@ import java.io.DataOutputStream; import java.io.IOException; import java.util.ArrayList; +import java.util.Iterator; +import java.util.Map; import org.apache.log4j.Logger; import uk.ac.gla.terrier.structures.BlockDirectIndexInputStream; -import uk.ac.gla.terrier.structures.BlockLexiconInputStream; -import uk.ac.gla.terrier.structures.DocumentIndex; +import uk.ac.gla.terrier.structures.BlockTermStatistics; +import uk.ac.gla.terrier.structures.Closeable; import uk.ac.gla.terrier.structures.Index; -import uk.ac.gla.terrier.structures.LexiconInputStream; +import uk.ac.gla.terrier.structures.LexiconEntry; import uk.ac.gla.terrier.structures.LexiconOutputStream; +import uk.ac.gla.terrier.structures.MapFileLexicon; import uk.ac.gla.terrier.utility.ApplicationSetup; import uk.ac.gla.terrier.utility.FieldScore; import uk.ac.gla.terrier.utility.Files; @@ -99,42 +102,9 @@ private static Logger logger = 
Logger.getRootLogger();
 	protected String finalLexiconClass = "uk.ac.gla.terrier.structures.Lexicon";
-	protected String finalLexiconInputStreamClass = "uk.ac.gla.terrier.structures.LexiconInputStream";
-
-	/**
-	 * Creates an instance of the BlockInvertedIndex class.
-	 * @deprecated
-	 */
-	public BlockInvertedIndexBuilder() {
-		this(ApplicationSetup.TERRIER_INDEX_PATH,
-				ApplicationSetup.TERRIER_INDEX_PREFIX);
-	}
-
-	/**
-	 * Creates an instance of the BlockInvertedIndex class using the given
-	 * filename.
-	 *
-	 * @param filename
-	 *            the name of the inverted file
-	 * @deprecated use this() or this(String, String) instead
-	 */
-	public BlockInvertedIndexBuilder(String filename) {
-		super(filename);
-		lexiconInputStream = BlockLexiconInputStream.class;
-		lexiconOutputStream = LexiconOutputStream.class;
-	}
-	/**
-	 * @deprecated
-	 */
-	public BlockInvertedIndexBuilder(String path, String prefix) {
-		super(path, prefix);
-		lexiconInputStream = BlockLexiconInputStream.class;
-		lexiconOutputStream = LexiconOutputStream.class;
-	}
-
-	public BlockInvertedIndexBuilder(Index index) {
-		super(index);
-		lexiconInputStream = BlockLexiconInputStream.class;
+
+	public BlockInvertedIndexBuilder(Index index, String structureName) {
+		super(index, structureName);
 		lexiconOutputStream = LexiconOutputStream.class;
 	}

@@ -145,26 +115,23 @@
	 * need to read the direct file is related to the parameter M, and
	 * consequently to the size of the available memory.
	 */
+	@SuppressWarnings("unchecked")
	public void createInvertedIndex() {
		numberOfPointersPerIteration = Integer.parseInt(ApplicationSetup.getProperty("invertedfile.processpointers", "2000000"));
		processTerms = Integer.parseInt(ApplicationSetup.getProperty("invertedfile.processterms", "25000"));
		try {
			Runtime r = Runtime.getRuntime();
			logger.info("creating block inverted index");
-			final String LexiconFilename = indexPathPrefix
-					+ ApplicationSetup.LEXICONSUFFIX;
-			final String DocumentIndexFilename = indexPathPrefix
-					+ ApplicationSetup.DOC_INDEX_SUFFIX;
-			DocumentIndex docIndex = new DocumentIndex(DocumentIndexFilename);
-			final int numberOfDocuments = docIndex.getNumberOfDocuments();
-			docIndex.close();
+			final String LexiconFilename = index.getPath() + "/" + index.getPrefix() + ".lexicon";
+			final int numberOfDocuments = index.getCollectionStatistics().getNumberOfDocuments();
			long assumedNumberOfPointers = Long.parseLong(index.getIndexProperty("num.Pointers", "0"));
			long numberOfTokens = 0;
			long numberOfPointers = 0;
-			BlockLexiconInputStream lexiconStream = (BlockLexiconInputStream) getLexInputStream(LexiconFilename);
-			numberOfUniqueTerms = lexiconStream.numberOfEntries();
+			int numberOfUniqueTerms = index.getLexicon().numberOfEntries();
+			Iterator<Map.Entry<String,LexiconEntry>> lexiconStream = (Iterator<Map.Entry<String,LexiconEntry>>)this.index.getIndexStructureInputStream("lexicon");
+
			// A temporary file for storing the updated
			// lexicon file, after creating the inverted file
			DataOutputStream dos = new DataOutputStream(Files.writeFileStream(LexiconFilename.concat(".tmp2")));
@@ -299,49 +266,50 @@
			this.numberOfTokens = numberOfTokens;
			this.numberOfPointers = numberOfPointers;
			file.close();
-			lexiconStream.close();
+
+			if (lexiconStream instanceof Closeable) {
+				((Closeable)lexiconStream).close();
+			}
			dos.close();

			// finalising the lexicon file with the updated information
			// on the frequencies and the offsets
-			BlockLexiconInputStream lis = (BlockLexiconInputStream) getLexInputStream(LexiconFilename);
-			// reading the original lexicon
-			LexiconOutputStream los = getLexOutputStream(LexiconFilename.concat(".tmp3"));
-			// the updated lexicon
+// finalising the lexicon file with the updated information
+			//on the frequencies and the offsets
+			//reading the original lexicon
+			lexiconStream = (Iterator<Map.Entry<String,LexiconEntry>>)index.getIndexStructureInputStream("lexicon");
+
+
+			//the updated lexicon
+			LexiconOutputStream los = getLexOutputStream("tmplexicon");
+
+			//the temporary data containing the offsets
 			DataInputStream dis = new DataInputStream(Files.openFileStream(LexiconFilename.concat(".tmp2")));
-
-			// the temporary data
-			while (lis.readNextEntryBytes() != -1) {
-				los.writeNextEntry(lis.getTermCharacters(), lis.getTermId(),
-						lis.getNt(),
-						// lis.getBlockFrequency(),
-						dis.readInt(),
-						// the term frequency
-						dis.readLong(), // the ending byte offset
-						dis.readByte());
+
+			while(lexiconStream.hasNext())
+			{
+				Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
+				LexiconEntry value = lee.getValue();
+				value.setPosition(dis.readLong(), dis.readByte());
+				los.writeNextEntry(lee.getKey(), value);
 			}
-			lis.close();
 			los.close();
 			dis.close();
-			if (! Files.delete(LexiconFilename))
-				logger.error("delete file .lex failed!");
-			if (! Files.delete(LexiconFilename.concat(".tmp2")))
-				logger.error("delete file .lex.tmp2 failed!");
-			if (! Files.rename(LexiconFilename.concat(".tmp3"), LexiconFilename))
-				logger.error("rename file .lex.tmp3 to .lex failed!");
-
-			index.addIndexStructure("lexicon",finalLexiconClass);
-			index.addIndexStructureInputStream("lexicon",finalLexiconInputStreamClass);
+			Files.delete(LexiconFilename.concat(".tmp2"));
+			MapFileLexicon.deleteMapFileLexicon("lexicon", index.getPath(), index.getPrefix());
+			MapFileLexicon.renameMapFileLexicon("tmplexicon", index.getPath(), index.getPrefix(), "lexicon", index.getPath(), index.getPrefix());
+
+			//TODO : BlockInvertedIndexBuilder should change the Lexicon to use BasicLexiconEntry instead of BlockLexiconEntry
+
 			index.addIndexStructure(
-					"inverted",
+					structureName,
					"uk.ac.gla.terrier.structures.BlockInvertedIndex",
-					"uk.ac.gla.terrier.structures.Lexicon,java.lang.String,java.lang.String",
-					"lexicon,path,prefix");
-			index.addIndexStructureInputStream(
-					"inverted",
-					"uk.ac.gla.terrier.structures.BlockInvertedIndexInputStream",
-					"java.lang.String,java.lang.String,uk.ac.gla.terrier.structures.LexiconInputStream",
-					"path,prefix,lexicon-inputstream");
+					"uk.ac.gla.terrier.structures.Index,java.lang.String",
+					"index,structureName");
+			index.addIndexStructureInputStream(
+					structureName,
+					"uk.ac.gla.terrier.structures.BlockInvertedIndexInputStream",
+					"uk.ac.gla.terrier.structures.Index,java.lang.String,java.util.Iterator",
+					"index,structureName,lexicon-inputstream");
 			index.setIndexProperty("num.inverted.fields.bits", ""+FieldScore.FIELDS_COUNT );
			//these should be already set, but in case they're not
			index.setIndexProperty("num.Terms", ""+numberOfUniqueTerms);
@@ -353,114 +321,20 @@
			logger.error("IOException occurred during creating the inverted file.
Stack trace follows.", ioe); } } - - /** - * Iterates through the lexicon, until it has reached the given number of - * pointers - * - * @param PointersToProcess - * Number of pointers to stop reading the lexicon after - * @param blexiconStream - * the lexicon input stream to read - * @param codesHashMap - * @param tmpStorageStorage - * @return - */ - protected IntLongTuple scanLexiconForPointers(final long PointersToProcess, - final LexiconInputStream blexiconStream, - final TIntIntHashMap codesHashMap, final ArrayList tmpStorageStorage) - throws IOException { - final BlockLexiconInputStream lexiconStream = (BlockLexiconInputStream) blexiconStream; - int processTerms = 0; - long numberOfPointersThisIteration = 0; - long numberOfBlocksThisIteration = 0; - int j = 0; // counter of loop iterations - while (numberOfPointersThisIteration < PointersToProcess) { - - if (lexiconStream.readNextEntry() == -1) - break; - - processTerms++; - - TIntArrayList[] tmpArray = new TIntArrayList[5]; - final int tmpNT = lexiconStream.getNt(); - tmpArray[0] = new TIntArrayList(tmpNT); - tmpArray[1] = new TIntArrayList(tmpNT); - tmpArray[2] = new TIntArrayList(tmpNT); - tmpArray[3] = new TIntArrayList(tmpNT); - tmpArray[4] = new TIntArrayList(lexiconStream.getBlockFrequency()); - numberOfPointersThisIteration += tmpNT; - numberOfBlocksThisIteration += lexiconStream.getBlockFrequency(); - - tmpStorageStorage.add(tmpArray); - - // the class TIntIntHashMap return zero when you look up for a - // the value of a key that does not exist in the hash map. - // For this reason, the values that will be inserted in the - // hash map are increased by one. - codesHashMap.put(lexiconStream.getTermId(), j + 1); - - // increment counter - j++; - } - if(logger.isDebugEnabled()){ - logger.debug(numberOfPointersThisIteration + " pointers == " - + processTerms + " terms == " + numberOfBlocksThisIteration - + " blocks"); - } - return new IntLongTuple(processTerms, numberOfPointersThisIteration); + + protected TIntArrayList[] createPointerForTerm(LexiconEntry le) + { + TIntArrayList[] tmpArray = new TIntArrayList[5]; + final int tmpNT = le.getDocumentFrequency(); + tmpArray[0] = new TIntArrayList(tmpNT); + tmpArray[1] = new TIntArrayList(tmpNT); + tmpArray[2] = new TIntArrayList(tmpNT); + tmpArray[3] = new TIntArrayList(tmpNT); + tmpArray[4] = new TIntArrayList(((BlockTermStatistics)le).getBlockCount()); + return tmpArray; } - /** - * Iterates through the lexicon, until it has reached the given number of - * terms - * - * @param processTerms - * Number of terms to stop reading the lexicon after - * @param blexiconStream - * the lexicon input stream to read - * @param codesHashMap - * @param tmpStorageStorage - * @return - */ - protected IntLongTuple scanLexiconForTerms(final int processTerms, - final LexiconInputStream blexiconStream, - final TIntIntHashMap codesHashMap, TIntArrayList[][] tmpStorage) - throws IOException { - final BlockLexiconInputStream lexiconStream = (BlockLexiconInputStream) blexiconStream; - int j = 0; - long numberOfBlocksThisIteration = 0; - long numberOfPointersThisIteration = 0; - for (; j < processTerms; j++) { - - if (lexiconStream.readNextEntry() == -1) - break; - - TIntArrayList[] tmpArray = new TIntArrayList[5]; - final int tmpNT = lexiconStream.getNt(); - tmpArray[0] = new TIntArrayList(tmpNT); - tmpArray[1] = new TIntArrayList(tmpNT); - tmpArray[2] = new TIntArrayList(tmpNT); - tmpArray[3] = new TIntArrayList(tmpNT); - tmpArray[4] = new TIntArrayList(lexiconStream.getBlockFrequency()); - - 
numberOfPointersThisIteration += tmpNT;
-			numberOfBlocksThisIteration += lexiconStream.getBlockFrequency();
-
-			tmpStorage[j] = tmpArray;
-
-			// the class TIntIntHashMap return zero when you look up for a
-			// the value of a key that does not exist in the hash map.
-			// For this reason, the values that will be inserted in the
-			// hash map are increased by one.
-			codesHashMap.put(lexiconStream.getTermId(), j + 1);
-		}
-		if(logger.isDebugEnabled()){
-			logger.debug(numberOfPointersThisIteration + " pointers == " + j
-					+ " terms == " + numberOfBlocksThisIteration + " blocks");
-		}
-		return new IntLongTuple(j, numberOfPointersThisIteration);
-	}
+	

	/**
	 * Traverses the direct files recording all occurrences of terms noted in
@@ -481,10 +355,7 @@
		// scan the direct file
		//BlockDirectIndexInputStream directInputStream = new BlockDirectIndexInputStream(
		//		indexPath, indexPrefix);
-		BlockDirectIndexInputStream directInputStream = 
-			index != null
-				? (BlockDirectIndexInputStream)index.getIndexStructureInputStream("direct")
-				: new BlockDirectIndexInputStream(indexPath, indexPrefix);
+		BlockDirectIndexInputStream directInputStream = (BlockDirectIndexInputStream)index.getIndexStructureInputStream("direct");
		int[][] documentTerms = null;
		int p = 0; // a document counter;
		while ((documentTerms = directInputStream.getNextTerms()) != null) {
@@ -580,6 +451,9 @@
			tmpMatrix[4] = null;
			tmpMatrix = null;
			tmpStorage[j] = null;
+			
+			dos.writeLong(file.getByteOffset());
+			dos.writeByte(file.getBitOffset());

			// write the first entry
			int docid = tmpMatrix0[0];
@@ -615,17 +489,17 @@
					blockindex++;
				}
			}
-			long endOffset = file.getByteOffset();
-			byte endBitOffset = file.getBitOffset();
-			endBitOffset--;
-			if (endBitOffset < 0 && endOffset > 0) {
-				endBitOffset = 7;
-				endOffset--;
-			}
+			//long endOffset = file.getByteOffset();
+			//byte endBitOffset = file.getBitOffset();
+			//endBitOffset--;
+			//if (endBitOffset < 0 && endOffset > 0) {
+			//	endBitOffset = 7;
+			//	endOffset--;
+			//}
			numTokens += frequency;
-			dos.writeInt(frequency);
-			dos.writeLong(endOffset);
-			dos.writeByte(endBitOffset);
+			//dos.writeInt(frequency);
+			//dos.writeLong(endOffset);
+			//dos.writeByte(endBitOffset);

			// dereference the arrays so they can be destroyed by GC
			tmpMatrix0 = tmpMatrix1 = tmpMatrix2 = tmpMatrix3 = tmpMatrix4 = null;
diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/BlockLexiconBuilder.java src/uk/ac/gla/terrier/structures/indexing/BlockLexiconBuilder.java
--- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/BlockLexiconBuilder.java	2009-01-28 20:16:57.000000000 +0000
+++ src/uk/ac/gla/terrier/structures/indexing/BlockLexiconBuilder.java	2009-03-03 14:34:49.000000000 +0000
@@ -26,299 +26,18 @@
  *   Craig Macdonald
  */
 package uk.ac.gla.terrier.structures.indexing;
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.PriorityQueue;
-
-import uk.ac.gla.terrier.structures.BlockLexiconInputStream;
-import uk.ac.gla.terrier.structures.BlockLexiconOutputStream;
 import uk.ac.gla.terrier.structures.Index;
-import uk.ac.gla.terrier.structures.Lexicon;
-import uk.ac.gla.terrier.structures.LexiconInputStream;
-import uk.ac.gla.terrier.structures.LexiconOutputStream;
-import uk.ac.gla.terrier.utility.ApplicationSetup;
 /**
  * Builds a block lexicon using block frequencies.
- * @author Douglas Johnson, Vassilis Plachouras & Craig Macdonald + * @author Craig Macdonald * @version $Revision: 1.32 $ */ public class BlockLexiconBuilder extends LexiconBuilder { - - - /** - * A default constructor of the class. The block lexicon is built in the - * default path and file: ApplicationSetup.TERRIER_INDEX_PATH and - * ApplicationSetup.TERRIER_INDEX_PREFIX respectively. - */ - public BlockLexiconBuilder() + public BlockLexiconBuilder(Index i, String _structureName) { - this(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX); + super(i, _structureName, + BlockLexiconMap.class, + "uk.ac.gla.terrier.structures.BlockLexiconEntry"); } - - /** - * Creates an instance of the class, given the path - * to save the final and temporary lexicons. - * @param pathname String the path to save the temporary lexicons. - */ - public BlockLexiconBuilder(String pathname, String prefix) { - super(pathname, prefix); - LexiconMapClass = BlockLexiconMap.class; - lexiconOutputStream = BlockLexiconOutputStream.class; - lexiconInputStream = BlockLexiconInputStream.class; - try{ TempLex = (LexiconMap) LexiconMapClass.newInstance(); } catch (Exception e) {logger.error(e);} - } - - public BlockLexiconBuilder(Index i) - { - super(i); - LexiconMapClass = BlockLexiconMap.class; - lexiconOutputStream = BlockLexiconOutputStream.class; - lexiconInputStream = BlockLexiconInputStream.class; - try{ TempLex = (LexiconMap) LexiconMapClass.newInstance(); } catch (Exception e) {logger.error(e);} - } - - /** - * The method that performs processing of the lexicon after the - * creation of the direct index has been completed. It flushes to - * disk the current temporary lexicon, and it starts the merging - * of the temporary lexicons and the creation of the lexicon index. - */ - public void finishedDirectIndexBuild() - { - logger.info("flushing block lexicon to disk after the direct index completed"); - //only write a temporary lexicon if there are any items in it - if (TempLex.getNumberOfNodes() > 0) - writeTemporaryLexicon(); - TempLex = null; - - //merges the temporary lexicons - if (tempLexFiles.size() > 0) - try{ - merge(tempLexFiles); - - //creates the offsets file - final String lexiconFilename = - indexPath + ApplicationSetup.FILE_SEPARATOR + - indexPrefix + ApplicationSetup.LEXICONSUFFIX; - LexiconInputStream lis = getLexInputStream(lexiconFilename); - createLexiconIndex( - lis, - lis.numberOfEntries(), - /* after inverted index is built, the lexicon will be transformed into a - * normal lexicon, without block frequency */ - Lexicon.lexiconEntryLength - ); - TermCount = lis.numberOfEntries(); - if (index != null) - { - index.addIndexStructure("lexicon", "uk.ac.gla.terrier.structures.BlockLexicon"); - index.addIndexStructureInputStream("lexicon", "uk.ac.gla.terrier.structures.BlockLexiconInputStream"); - index.setIndexProperty("num.Terms", ""+lis.numberOfEntries()); - index.setIndexProperty("num.Pointers", ""+lis.getNumberOfPointersRead()); - } - } catch(IOException ioe){ - logger.error("Indexing failed to merge temporary lexicons to disk. 
",ioe); - } - else - logger.warn("No temporary lexicons to merge, skipping"); - } - - /** Merge the two LexiconInputStreams into the given LexiconOutputStream - * @param lis1 First lexicon to be merged - * @param lis2 Second lexicon to be merged - * @param los Lexion to be merged to - */ - protected void mergeTwoLexicons( - LexiconInputStream blis1, - LexiconInputStream blis2, - LexiconOutputStream blos) throws IOException - { - final BlockLexiconInputStream lis1 = (BlockLexiconInputStream)blis1; - final BlockLexiconInputStream lis2 = (BlockLexiconInputStream)blis2; - final BlockLexiconOutputStream los = (BlockLexiconOutputStream)blos; - - boolean hasMore1 = true; - boolean hasMore2 = true; - int termID1 = 0; - int termID2 = 0; - hasMore1 = (lis1.readNextEntry()!=-1); - hasMore2 = (lis2.readNextEntry()!=-1); - String sTerm1 = null; - String sTerm2 = null; - if (hasMore1) { - termID1 = lis1.getTermId(); - sTerm1 = lis1.getTerm(); - } - if (hasMore2) { - termID2 = lis2.getTermId(); - sTerm2 = lis2.getTerm(); - } - while (hasMore1 && hasMore2) { - int compareString = 0; - if (termID1 != termID2) - { - compareString = sTerm1.compareTo(sTerm2); - if (compareString == 0)//, but termids don't match - { - logger.error("Term "+sTerm1+" had two termids ("+ termID1+","+termID2+")"); - } - } - - if (compareString <0) { - los.writeNextEntry(sTerm1, termID1, lis1.getNt(), lis1.getBlockFrequency(), lis1.getTF(), lis1.getEndOffset(), lis1.getEndBitOffset()); - hasMore1 = (lis1.readNextEntry()!=-1); - if (hasMore1) { - termID1 = lis1.getTermId(); - sTerm1 = lis1.getTerm(); - } - } else if (compareString >0) { - los.writeNextEntry(sTerm2, termID2, lis2.getNt(), lis2.getBlockFrequency(), lis2.getTF(), lis2.getEndOffset(), lis2.getEndBitOffset()); - hasMore2 = (lis2.readNextEntry()!=-1); - if (hasMore2) { - termID2 = lis2.getTermId(); - sTerm2 = lis2.getTerm(); - } - } else /*if (compareString == 0)*/ { - los.writeNextEntry( - sTerm1, - termID1, - lis1.getNt() + lis2.getNt(), - lis1.getBlockFrequency() + lis2.getBlockFrequency(), - lis1.getTF() + lis2.getTF(), - 0, //inverted index not built yet - (byte)0 //inverted index not built yet - ); - - hasMore1 = (lis1.readNextEntry()!=-1); - hasMore2 = (lis2.readNextEntry()!=-1); - if (hasMore1) { - termID1 = lis1.getTermId(); - sTerm1 = lis1.getTerm(); - } - if (hasMore2) { - termID2 = lis2.getTermId(); - sTerm2 = lis2.getTerm(); - } - } - } - if (hasMore1) { - lis2.close(); - - while (hasMore1) { - los.writeNextEntry(sTerm1, termID1, lis1.getNt(), lis1.getBlockFrequency(), lis1.getTF(), lis1.getEndOffset(), lis1.getEndBitOffset()); - hasMore1 = (lis1.readNextEntry()!=-1); - if (hasMore1) { - termID1 = lis1.getTermId(); - sTerm1 = lis1.getTerm(); - } - } - - //close input file 1 stream - lis1.close(); - - } else if (hasMore2) { - lis1.close(); - - while (hasMore2) { - los.writeNextEntry(sTerm2, termID2, lis2.getNt(), lis2.getBlockFrequency(), lis2.getTF(), lis2.getEndOffset(), lis2.getEndBitOffset()); - hasMore2 = (lis2.readNextEntry()!=-1); - if (hasMore2) { - termID2 = lis2.getTermId(); - sTerm2 = lis2.getTerm(); - } - } - //close input file 2 stream - lis2.close(); - } - //closing ouptut lexicon stream - los.close(); - } - - protected void mergeNLexicons(final LexiconInputStream[] _lis, final LexiconOutputStream _los) throws IOException - { - final int numLexicons = _lis.length; - long totalTokens = 0; - long totalPointers = 0; - final int hasMore[] = new int[numLexicons]; - Arrays.fill(hasMore, -1); - final PriorityQueue terms = new PriorityQueue(numLexicons); 
- final BlockLexiconOutputStream los = (BlockLexiconOutputStream)_los; - final BlockLexiconInputStream[] lis = new BlockLexiconInputStream[numLexicons]; - - for(int i=0;i 0) - { - //what term are we working on - targetTerm = terms.poll(); - //logger.debug("Current term is "+targetTerm + "length="+targetTerm.length()); - //for each input lexicon - for(int i=0;i lexiconStream) throws IOException { + final String[] terms = tfs.keys(new String[0]); Arrays.sort(terms); for (String t : terms) { - lexiconStream.writeNextEntry(t, TermCodes.getCode(t), nts.get(t), tfs.get(t), blockFreqs.get(t), zerol, zerob); + lexiconStream.writeNextEntry(t, new BlockLexiconEntry(TermCodes.getCode(t), nts.get(t), tfs.get(t), zerol, zerob, blockFreqs.get(t))); } } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/DirectIndexBuilder.java src/uk/ac/gla/terrier/structures/indexing/DirectIndexBuilder.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/DirectIndexBuilder.java 2009-01-28 20:16:57.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/indexing/DirectIndexBuilder.java 2009-03-03 14:34:49.000000000 +0000 @@ -46,9 +46,6 @@ /** The gamma compressed file containing the terms. */ protected BitOut file; - /** The number of documents to be indexed before flushing the data to disk.*/ - protected static final int DocumentsPerFlush = ApplicationSetup.BUNDLE_SIZE; - /** The number of different fields that are used for indexing field information.*/ protected static final int fieldTags = FieldScore.FIELDS_COUNT; @@ -121,13 +118,6 @@ /* find out where we are */ FilePosition rtr = getLastEndOffset(); - /* flush to disk if necessary */ - if (DocumentsSinceFlush++ >= DocumentsPerFlush) - { - flushBuffer(); - resetBuffer(); - DocumentsSinceFlush = 0; - } /* and then return where the position of the last * write to the DirectIndex */ return rtr; @@ -177,8 +167,6 @@ */ public void finishedCollections() { - flushBuffer(); - resetBuffer(); DocumentsSinceFlush = 0; logger.info("flush direct index"); try{ @@ -199,13 +187,7 @@ index.setIndexProperty("num.direct.fields.bits", ""+fieldTags); } } - /** - * Flushes the data to disk. - * @deprecated since 2.0 - */ - public void flushBuffer() { - //file.flush(); - } + /** * Returns the current offset in the direct index. * @return FilePosition the offset in the direct index. @@ -224,13 +206,7 @@ return new FilePosition(endByte, endBit); } - /** - * Resets the internal buffer for writing data. This method should - * be called before adding any documents to the direct index. - */ - public void resetBuffer() { - //file.writeReset(); - } + /** * Closes the underlying gamma compressed file. 
*/
diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/InvertedIndexBuilder.java src/uk/ac/gla/terrier/structures/indexing/InvertedIndexBuilder.java
--- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/InvertedIndexBuilder.java	2009-01-28 20:16:58.000000000 +0000
+++ src/uk/ac/gla/terrier/structures/indexing/InvertedIndexBuilder.java	2009-03-03 14:34:49.000000000 +0000
@@ -33,15 +33,22 @@
 import java.io.DataOutputStream;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.Map;

+import org.apache.hadoop.io.Text;
 import org.apache.log4j.Logger;
+
 import uk.ac.gla.terrier.compression.BitOut;
 import uk.ac.gla.terrier.compression.BitOutputStream;
+import uk.ac.gla.terrier.structures.Closeable;
 import uk.ac.gla.terrier.structures.DirectIndexInputStream;
-import uk.ac.gla.terrier.structures.DocumentIndex;
 import uk.ac.gla.terrier.structures.Index;
-import uk.ac.gla.terrier.structures.LexiconInputStream;
+import uk.ac.gla.terrier.structures.LexiconEntry;
 import uk.ac.gla.terrier.structures.LexiconOutputStream;
+import uk.ac.gla.terrier.structures.MapFileLexicon;
+import uk.ac.gla.terrier.structures.MapFileLexiconOutputStream;
+import uk.ac.gla.terrier.structures.seralization.FixedSizeWriteableFactory;
 import uk.ac.gla.terrier.utility.ApplicationSetup;
 import uk.ac.gla.terrier.utility.FieldScore;
 import uk.ac.gla.terrier.utility.Files;
@@ -84,8 +91,6 @@
 */
public class InvertedIndexBuilder {

-	/** class to be used as a lexiconinputstream. set by this and child classes */
-	protected Class lexiconInputStream = null;
	/** class to be used as a LexiconOutputStream. set by this and child classes */
	protected Class lexiconOutputStream = null;

@@ -104,13 +109,6 @@
		}
	}

-	/** the directory in which index files should be created */
-	protected String indexPath;
-	/** the first part of the filename component of index files */
-	protected String indexPrefix;
-
-	protected String indexPathPrefix;
-
	/** The number of unique terms in the vocabulary.*/
	public int numberOfUniqueTerms;

@@ -127,6 +125,8 @@
	protected final boolean useFieldInformation = FieldScore.USE_FIELD_INFORMATION;

	protected Index index = null;
+	
+	protected String structureName = null;

	/** The number of pointers to be processed in an iteration. This directly corresponds to the
	 * property invertedfile.processpointers. If this property is set and > 0, then each
@@ -139,63 +139,21 @@
	 * The underlying bit file.
	 */
	protected BitOut file;
-
-	/**
-	 * Constructor of the class InvertedIndex.
- * @deprecated - */ - public InvertedIndexBuilder(String Path, String Prefix) - { - indexPath = Path; indexPrefix = Prefix; - indexPathPrefix = indexPath + ApplicationSetup.FILE_SEPARATOR + indexPrefix; - try{ - file = new BitOutputStream(indexPathPrefix + ApplicationSetup.IFSUFFIX); - } catch (IOException ioe) { - logger.error("creating BitOutputStream for writing the inverted file : ", ioe); - } - lexiconInputStream = LexiconInputStream.class; - lexiconOutputStream = LexiconOutputStream.class; - } - public InvertedIndexBuilder(Index i) + public InvertedIndexBuilder(Index i, String _structureName) { this.index = i; - indexPath = index.getPath(); indexPrefix = index.getPrefix(); - indexPathPrefix = indexPath + ApplicationSetup.FILE_SEPARATOR + indexPrefix; + this.structureName = _structureName; + try{ - file = new BitOutputStream(indexPathPrefix + ApplicationSetup.IFSUFFIX); + file = new BitOutputStream(index.getPath() + "/"+ index.getPrefix() + "." +structureName + ".bf"); } catch (IOException ioe) { logger.error("creating BitOutputStream for writing the inverted file : ", ioe); } - lexiconInputStream = LexiconInputStream.class; lexiconOutputStream = LexiconOutputStream.class; } - /** - * A default constructor of the class InvertedIndex. - * @deprecated - */ - public InvertedIndexBuilder() { - this(ApplicationSetup.TERRIER_INDEX_PATH, - ApplicationSetup.TERRIER_INDEX_PREFIX); - } - - /** - * Creates an instance of the InvertedIndex - * class using the given filename. - * @param filename The name of the inverted file - * @deprecated Use this() or this(String, String) - */ - public InvertedIndexBuilder(String filename) { - try{ - file = new BitOutputStream(filename); - } catch (IOException ioe) { - logger.error("Creating BitOutputStream for writing the direct file : ", ioe); - } - lexiconInputStream = LexiconInputStream.class; - lexiconOutputStream = LexiconOutputStream.class; - } /** * Closes the underlying bit file. @@ -208,21 +166,22 @@ * Creates the inverted index using the already created direct index, * document index and lexicon. 
*/
+	@SuppressWarnings("unchecked")
	public void createInvertedIndex() {
		try {
			Runtime r = Runtime.getRuntime();
			logger.debug("creating inverted index");
-			final String LexiconFilename = indexPathPrefix + ApplicationSetup.LEXICONSUFFIX;
+			final String LexiconFilename = index.getPath() + "/" + index.getPrefix() + ".lexicon";
			final int numberOfDocuments = index.getCollectionStatistics().getNumberOfDocuments();
			long assumedNumberOfPointers = Long.parseLong(index.getIndexProperty("num.Pointers", "0"));
			long numberOfTokens = 0;
			long numberOfPointers = 0;
-
-			LexiconInputStream lexiconStream = getLexInputStream(LexiconFilename);
-			numberOfUniqueTerms = lexiconStream.numberOfEntries();
-			final int fieldsCount = FieldScore.FIELDS_COUNT;
+			int numberOfUniqueTerms = index.getLexicon().numberOfEntries();
+			Iterator<Map.Entry<String,LexiconEntry>> lexiconStream = 
+				(Iterator<Map.Entry<String,LexiconEntry>>)index.getIndexStructureInputStream("lexicon");
+			
			//A temporary file for storing the updated lexicon file, after
			// creating the inverted file
			DataOutputStream dos = new DataOutputStream(Files.writeFileStream(LexiconFilename.concat(".tmp2")));
@@ -374,52 +333,56 @@

			this.numberOfUniqueTerms = numberOfUniqueTerms;
			this.numberOfPointers = numberOfPointers;
-			lexiconStream.close();
+			if (lexiconStream instanceof Closeable) {
+				((Closeable)lexiconStream).close();
+			}
			dos.close();

			//finalising the lexicon file with the updated information
			//on the frequencies and the offsets
-			//reading the original lexicon
-			LexiconInputStream lis = getLexInputStream(LexiconFilename);
+			lexiconStream = (Iterator<Map.Entry<String,LexiconEntry>>)index.getIndexStructureInputStream("lexicon");
+			
			//the updated lexicon
-			LexiconOutputStream los = getLexOutputStream(LexiconFilename.concat(".tmp3"));
+			LexiconOutputStream los = getLexOutputStream("tmplexicon");

			//the temporary data containing the offsets
			DataInputStream dis = new DataInputStream(Files.openFileStream(LexiconFilename.concat(".tmp2")));
-			while (lis.readNextEntryBytes() != -1) {
-				los.writeNextEntry(lis.getTermCharacters(), lis.getTermId(),
-					lis.getNt(),
-					dis.readInt(), //the term frequency
-					dis.readLong(), //end byte offset
-					dis.readByte());//end bit offset
+			while(lexiconStream.hasNext())
+			{
+				Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
+				LexiconEntry value = lee.getValue();
+				value.setPosition(dis.readLong(), dis.readByte());
+				los.writeNextEntry(lee.getKey(), value);
+			}
+			if (lexiconStream instanceof Closeable) {
+				((Closeable)lexiconStream).close();
			}
-			lis.close();
			los.close();
			dis.close();
-			if (! Files.delete(LexiconFilename))
-				logger.error("delete file .lex failed!");
-			if (! Files.delete(LexiconFilename.concat(".tmp2")))
-				logger.error("delete file .lex.tmp2 failed!");
-			if (! Files.rename(LexiconFilename.concat(".tmp3"), LexiconFilename))
-				logger.error("rename file .lex.tmp3 to .lex failed!");
+			Files.delete(LexiconFilename.concat(".tmp2"));
+			MapFileLexicon.deleteMapFileLexicon("lexicon", index.getPath(), index.getPrefix());
+			MapFileLexicon.renameMapFileLexicon(
+					"tmplexicon", index.getPath(), index.getPrefix(),
+					"lexicon", index.getPath(), index.getPrefix());

			index.addIndexStructure(
-					"inverted",
+					structureName,
					"uk.ac.gla.terrier.structures.InvertedIndex",
-					"uk.ac.gla.terrier.structures.Lexicon,java.lang.String,java.lang.String",
-					"lexicon,path,prefix");
+					"uk.ac.gla.terrier.structures.Index,java.lang.String",
+					"index,structureName");
			index.addIndexStructureInputStream(
-					"inverted",
+					structureName,
					"uk.ac.gla.terrier.structures.InvertedIndexInputStream",
-					"java.lang.String,java.lang.String,uk.ac.gla.terrier.structures.LexiconInputStream",
-					"path,prefix,lexicon-inputstream");
+					"uk.ac.gla.terrier.structures.Index,java.lang.String,java.util.Iterator",
+					"index,structureName,lexicon-inputstream");
			index.setIndexProperty("num.inverted.fields.bits", ""+FieldScore.FIELDS_COUNT );
			//should be already set, but in case they're not
			index.setIndexProperty("num.Terms", ""+numberOfUniqueTerms);
			index.setIndexProperty("num.Tokens", ""+numberOfTokens);
			index.setIndexProperty("num.Pointers", ""+numberOfPointers);
+			index.flush();
			System.gc();

		} catch (IOException ioe) {
@@ -427,6 +390,16 @@
		}
	}

+	protected TIntArrayList[] createPointerForTerm(LexiconEntry le)
+	{
+		TIntArrayList[] tmpArray = new TIntArrayList[3];
+		final int tmpNT = le.getDocumentFrequency();
+		tmpArray[0] = new TIntArrayList(tmpNT);
+		tmpArray[1] = new TIntArrayList(tmpNT);
+		tmpArray[2] = new TIntArrayList(tmpNT);
+		return tmpArray;
+	}
+
	/** Iterates through the lexicon, until it has reached the given number of pointers
	 * @param PointersToProcess Number of pointers to stop reading the lexicon after
	 * @param lexiconStream the lexicon input stream to read
	 * @param codesHashMap
	 * @param tmpStorageStorage
	 * @return
	 */
	protected IntLongTuple scanLexiconForPointers(
		final long PointersToProcess,
-		final LexiconInputStream lexiconStream,
+		final Iterator<Map.Entry<String,LexiconEntry>> lexiconStream,
		final TIntIntHashMap codesHashMap,
		final ArrayList tmpStorageStorage)
		throws IOException
@@ -446,27 +419,21 @@
		int j=0; //counter of loop iterations
		while(numberOfPointersThisIteration < PointersToProcess) {
-			if (lexiconStream.readNextEntry() == -1)
+			if (! lexiconStream.hasNext())
				break;
-			processTerms++;
+			Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
+			LexiconEntry le = lee.getValue();

-			TIntArrayList[] tmpArray = new TIntArrayList[3];
-			final int tmpNT = lexiconStream.getNt();
-			tmpArray[0] = new TIntArrayList(tmpNT);
-			tmpArray[1] = new TIntArrayList(tmpNT);
-			tmpArray[2] = new TIntArrayList(tmpNT);
-
-			numberOfPointersThisIteration += tmpNT;
-
-
-			tmpStorageStorage.add(tmpArray);
+			processTerms++;
+			numberOfPointersThisIteration += le.getDocumentFrequency();
+			tmpStorageStorage.add(createPointerForTerm(le));

			//the class TIntIntHashMap returns zero when you look up the
			//value of a key that does not exist in the hash map.
			//For this reason, the values that will be inserted in the
			//hash map are increased by one.
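The comment above is worth a concrete illustration: GNU Trove's TIntIntHashMap.get() returns 0 when a key is absent, so a stored value of 0 would be indistinguishable from a miss. Storing j + 1 reserves 0 as the "not in this batch" marker. A small sketch (termId, j and someTermId are illustrative values):

import gnu.trove.TIntIntHashMap;

TIntIntHashMap codes = new TIntIntHashMap();
codes.put(termId, j + 1);         // slot j is stored as j + 1; 0 now means "absent"

int code = codes.get(someTermId); // 0 if someTermId was never put
if (code > 0) {
    int slot = code - 1;          // recover the real tmpStorage index
    // accumulate this posting into tmpStorage[slot]
}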
-			codesHashMap.put(lexiconStream.getTermId(), j + 1);
+			codesHashMap.put(le.getTermId(), j + 1);

			//increment counter
			j++;
@@ -488,7 +455,7 @@
	 */
	protected IntLongTuple scanLexiconForTerms(
		final int processTerms,
-		final LexiconInputStream lexiconStream,
+		final Iterator<Map.Entry<String,LexiconEntry>> lexiconStream,
		final TIntIntHashMap codesHashMap,
		TIntArrayList[][] tmpStorage)
		throws IOException
@@ -498,11 +465,14 @@
		long numberOfPointersThisIteration = 0;
		for (; j < processTerms; j++) {
-			if (lexiconStream.readNextEntry() == -1)
+			if (! lexiconStream.hasNext())
				break;
+			
+			Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
+			LexiconEntry le = lee.getValue();

			TIntArrayList[] tmpArray = new TIntArrayList[3];
-			final int tmpNT = lexiconStream.getNt();
+			final int tmpNT = le.getDocumentFrequency();
			tmpArray[0] = new TIntArrayList(tmpNT);
			tmpArray[1] = new TIntArrayList(tmpNT);
			tmpArray[2] = new TIntArrayList(tmpNT);
@@ -516,7 +486,7 @@
			//the value of a key that does not exist in the hash map.
			//For this reason, the values that will be inserted in the
			//hash map are increased by one.
-			codesHashMap.put(lexiconStream.getTermId(), j + 1);
+			codesHashMap.put(le.getTermId(), j + 1);
		}
		if (logger.isDebugEnabled())
			logger.debug(
@@ -538,10 +508,7 @@
		throws IOException
	{
		//scan the direct file
-		DirectIndexInputStream directInputStream = 
-			index != null
-				? (DirectIndexInputStream)index.getIndexStructureInputStream("direct")
-				: new DirectIndexInputStream(indexPath, indexPrefix);
+		DirectIndexInputStream directInputStream = (DirectIndexInputStream)index.getIndexStructureInputStream("direct");
		int[][] documentTerms = null;
		int p = 0; //a document counter;
		final boolean useFieldInformation = this.useFieldInformation;
@@ -582,8 +549,7 @@
	protected void traverseDirectFile(int[][][] tmpStorage,
			int[] indices, TIntIntHashMap codesHashMap)
			throws IOException {
-		DirectIndexInputStream directInputStream = new DirectIndexInputStream(
-				indexPath, indexPrefix);
+		DirectIndexInputStream directInputStream = (DirectIndexInputStream)index.getIndexStructureInputStream("direct");
		int[][] documentTerms = null;
		int[] documentTerms0 = null;
		int[] documentTerms1 = null;
@@ -652,14 +618,13 @@
		throws IOException
	{
		//write to the inverted file. We should note that the lexicon
-		//file should be updated as well with the term frequency and
-		//the endOffset and endBitOffset.
-
-		//remove this, as it now happens at the end of this method
-		//the first call is made at the start of createInvertedIndex
-		//file.writeReset();
+		//should be updated with the start bit and byte offset for this
+		//set of postings.
		int frequency;
		long numTokens = 0;
		for (int j = 0; j < processTerms; j++) {
+			dos.writeLong(file.getByteOffset());
+			dos.writeByte(file.getBitOffset());
+			
			frequency = 0; //the term frequency
			TIntArrayList[] tmpMatrix = tmpStorage[j];
			final int[] tmpMatrix0 = tmpMatrix[0].toNativeArray();
@@ -705,21 +670,17 @@
				}
			}

-			long endOffset = file.getByteOffset();
-			byte endBitOffset = file.getBitOffset();
-			endBitOffset--;
-			if (endBitOffset < 0 && endOffset > 0) {
-				endBitOffset = 7;
-				endOffset--;
-			}
+			//long endOffset = file.getByteOffset();
+			//byte endBitOffset = file.getBitOffset();
+			//endBitOffset--;
+			//if (endBitOffset < 0 && endOffset > 0) {
+			//	endBitOffset = 7;
+			//	endOffset--;
+			//}
			numTokens += frequency;
-			dos.writeInt(frequency);
-			dos.writeLong(endOffset);
-			dos.writeByte(endBitOffset);
+			//dos.writeInt(frequency);
+			
		}
-		//file.writeFlush();
-		//we have to force a reset here, as otherwise the buffer isn't cleared.
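The reworked writeInvertedFilePart above inverts the old bookkeeping: instead of deriving each term's end offset after its postings are compressed, it writes the start byte and bit offset to the .tmp2 side file before emitting the postings, and the lexicon-finalisation loop then stitches those offsets into each LexiconEntry. Condensed from the hunks above (dos, file, lexiconStream, los and dis as in the surrounding code):

// writing: before term j's postings are emitted
dos.writeLong(file.getByteOffset());   // start byte in the inverted bit file
dos.writeByte(file.getBitOffset());    // start bit within that byte
// ... gamma/unary-compressed postings for term j follow ...

// finalising: replay the lexicon in the same term order, attaching offsets
while (lexiconStream.hasNext()) {
    Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
    LexiconEntry value = lee.getValue();
    value.setPosition(dis.readLong(), dis.readByte());
    los.writeNextEntry(lee.getKey(), value);
}

Because both loops visit terms in identical order, no term id needs to be stored alongside the offsets.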
- //file.writeReset(); return numTokens; } @@ -734,124 +695,6 @@ */ protected int processTerms = Integer.parseInt(ApplicationSetup.getProperty("invertedfile.processterms", "75000")); - /* - for (int i = 0; i < numberOfUniqueTerms; i = i + processTerms) { - //set the number of terms to process from the lexicon - if ((i + processTerms) > numberOfUniqueTerms) - processTerms = (int) numberOfUniqueTerms - i; - //start processing part of the lexicon - startProcessingLexicon = System.currentTimeMillis(); - //preparing the data structures to store the data - int[] indices = new int[processTerms]; - int[][][] tmpStorage = new int[processTerms][][]; - TIntIntHashMap codesHashMap = new TIntIntHashMap(processTerms); - int numberOfPointersPerIteration = 0; - - int numOfFields = 2; - if (useFieldInformation) - numOfFields = 3; - - for (int j = 0; j < processTerms; j++) { - lexiconStream.readNextEntry(); - //int[][] tmpArray = new int[numOfFields][lexiconStream.getNt()]; - numberOfPointersPerIteration += lexiconStream.getNt(); - //tmpStorage.add(tmpArray); - tmpStorage[j] = new int[numOfFields][lexiconStream.getNt()]; - //the class TIntIntHashMap return zero when you look up for - // a the value of a key that does not exist in the hash map. - //For this reason, the values that will be inserted in the - //hash map are increased by one. - codesHashMap.put(lexiconStream.getTermId(), j + 1); - } - numberOfPointers += numberOfPointersPerIteration; - endProcessingLexicon = System.currentTimeMillis(); - startTraversingDirectFile = System.currentTimeMillis(); - //scan the direct file - //uses indices, tmpStorage and codesHashMap - traverseDirectFile(tmpStorage, indices, codesHashMap); - //end of traversing the - endTraversingDirectFile = System.currentTimeMillis(); - startWritingInvertedFile = System.currentTimeMillis(); - //write to the inverted file. We should note that the lexicon - //file should be updated as well with the term frequency and - //the endOffset and endBitOffset. - //file.writeReset(); - int frequency; - int[][] tmpMatrix = null; - int[] tmpMatrix0 = null; - int[] tmpMatrix1 = null; - - for (int j = 0; j < processTerms; j++) { - frequency = 0; //the term frequency - //tmpMatrix = (int[][]) tmpStorage.elementAt(j); - tmpMatrix = tmpStorage[j]; - tmpMatrix0 = tmpMatrix[0]; - tmpMatrix1 = tmpMatrix[1]; - - //we do not need to sort because the documents are read in - //order of docid, and therefore the arrays are already - // sorted. 
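The commented-out block being deleted here documents the posting-list encoding itself, which is unchanged: docids arrive sorted, so each docid is written as a gamma-coded gap from its predecessor (the first one as docid + 1, since a gamma code cannot represent 0), and each within-document frequency as a unary code. A minimal sketch against the BitOut calls used above (writePostings is a hypothetical helper, not a Terrier method):

void writePostings(BitOut file, int[] docids, int[] tfs) throws IOException {
    file.writeGamma(docids[0] + 1);                  // first entry: docid + 1
    file.writeUnary(tfs[0]);
    for (int k = 1; k < docids.length; k++) {
        file.writeGamma(docids[k] - docids[k - 1]);  // gap to the previous docid
        file.writeUnary(tfs[k]);
    }
}

In the field-aware variant, a writeBinary(fieldsCount, fieldScore) call follows each unary frequency, exactly as in the loop above.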
- if (useFieldInformation) { - int[] tmpMatrix2 = tmpMatrix[2]; - //write the first entry - file.writeGamma(tmpMatrix0[0] + 1); - frequency += tmpMatrix1[0]; - file.writeUnary(tmpMatrix1[0]); - file.writeBinary(fieldsCount, tmpMatrix2[0]); - final int tmpMatrix0Length = tmpMatrix0.length; - for (int k = 1; k < tmpMatrix0Length; k++) { - file.writeGamma(tmpMatrix0[k] - tmpMatrix0[k - 1]); - frequency += tmpMatrix1[k]; - file.writeUnary(tmpMatrix1[k]); - file.writeBinary(fieldsCount, tmpMatrix2[k]); - } - } else { - //write the first entry - file.writeGamma(tmpMatrix0[0] + 1); - frequency += tmpMatrix1[0]; - file.writeUnary(tmpMatrix1[0]); - final int tmpMatrix0Length = tmpMatrix0.length; - for (int k = 1; k < tmpMatrix0Length; k++) { - file.writeGamma(tmpMatrix0[k] - tmpMatrix0[k - 1]); - frequency += tmpMatrix1[k]; - file.writeUnary(tmpMatrix1[k]); - } - } - - long endOffset = file.getByteOffset(); - byte endBitOffset = file.getBitOffset(); - endBitOffset--; - if (endBitOffset < 0 && endOffset > 0) { - endBitOffset = 7; - endOffset--; - } - numberOfTokens += frequency; - dos.writeInt(frequency); - dos.writeLong(endOffset); - dos.writeByte(endBitOffset); - } - //file.writeFlush(); - endWritingInvertedFile = System.currentTimeMillis(); - - System.err.println("time to process part of lexicon: " - + ((endProcessingLexicon - startProcessingLexicon) / 1000D)); - System.err.println("time to traverse direct file: " - + ((endTraversingDirectFile - startTraversingDirectFile) / 1000D)); - System.err.println("time to write inverted file: " - + ((endWritingInvertedFile - startWritingInvertedFile) / 1000D)); - System.err.println("time to perform one iteration: " - + ((endWritingInvertedFile - startProcessingLexicon) / 1000D)); - System.err.println("number of pointers processed: " - + numberOfPointersPerIteration); - - indices = null; - tmpStorage = null; - codesHashMap.clear(); - codesHashMap = null; - - } - */ - public static void displayMemoryUsage(Runtime r) { if (logger.isDebugEnabled()) @@ -862,26 +705,14 @@ ); } - public LexiconInputStream getLexInputStream(String filename) - { - LexiconInputStream li = null; - try{ - li = (LexiconInputStream) lexiconInputStream.getConstructor(String.class).newInstance(filename); - } catch (Exception e) { - logger.error("Problem loading a LexiconInputStream", e); - } - return li; - } - public LexiconOutputStream getLexOutputStream(String filename) + @SuppressWarnings("unchecked") + protected LexiconOutputStream getLexOutputStream(String structureName) throws IOException { - LexiconOutputStream lo = null; - try{ - lo = (LexiconOutputStream) lexiconOutputStream.getConstructor(String.class).newInstance(filename); - } catch (Exception e) { - logger.error("Problem loading a LexiconOutputStream", e); - } - return lo; + return new MapFileLexiconOutputStream( + index.getPath(), index.getPrefix(), + structureName, + (FixedSizeWriteableFactory)index.getIndexStructure("lexicon-keyfactory")); } } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/LexiconBuilder.java src/uk/ac/gla/terrier/structures/indexing/LexiconBuilder.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/LexiconBuilder.java 2009-01-28 20:16:58.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/indexing/LexiconBuilder.java 2009-03-03 14:34:49.000000000 +0000 @@ -25,27 +25,24 @@ * Vassilis Plachouras */ package uk.ac.gla.terrier.structures.indexing; -import 
gnu.trove.TIntObjectHashMap;
-
-import java.io.DataOutputStream;
 import java.io.IOException;
-import java.io.ObjectOutputStream;
-import java.io.OutputStream;
 import java.util.Arrays;
-import java.util.HashSet;
+import java.util.Iterator;
 import java.util.LinkedList;
+import java.util.Map;
 import java.util.PriorityQueue;
-import java.util.Set;

+import org.apache.hadoop.io.Text;
 import org.apache.log4j.Logger;

+import uk.ac.gla.terrier.structures.Closeable;
 import uk.ac.gla.terrier.structures.Index;
-import uk.ac.gla.terrier.structures.Lexicon;
-import uk.ac.gla.terrier.structures.LexiconInputStream;
+import uk.ac.gla.terrier.structures.LexiconEntry;
 import uk.ac.gla.terrier.structures.LexiconOutputStream;
-import uk.ac.gla.terrier.structures.UTFLexiconInputStream;
+import uk.ac.gla.terrier.structures.MapFileLexicon;
+import uk.ac.gla.terrier.structures.MapFileLexiconOutputStream;
+import uk.ac.gla.terrier.structures.seralization.FixedSizeWriteableFactory;
 import uk.ac.gla.terrier.utility.ApplicationSetup;
-import uk.ac.gla.terrier.utility.Files;
 /**
 * Builds temporary lexicons during indexing a collection and
 * merges them when the indexing of a collection has finished.
@@ -54,12 +51,13 @@
 */
public class LexiconBuilder
{
-	/** class to be used as a lexiconinputstream. set by this and child classes */
-	protected Class lexiconInputStream = null;
+
	/** class to be used as a LexiconOutputStream. set by this and child classes */
-	protected Class lexiconOutputStream = null;
+	protected Class<? extends LexiconOutputStream> lexiconOutputStream = null;

-	protected Class LexiconMapClass = null;
+	protected Class<? extends LexiconMap> LexiconMapClass = null;
+	
+	protected final String lexiconEntryFactoryValueClass;

	/** The logger used for this class */
	protected static Logger logger = Logger.getRootLogger();
@@ -70,12 +68,10 @@
	/** How many terms are in the final lexicon */
	protected int TermCount = 0;

-	/**
-	 * The number of documents for which a temporary lexicon
-	 * is created.
-	 */
-	protected static final int DocumentsPerLexicon = ApplicationSetup.BUNDLE_SIZE;
-	/** The linkedlist in which the temporary lexicon filenames are stored.
+	/** The number of documents for which a temporary lexicon is created.
+	  * Corresponds to property bundle.size, default value 2000. */
+	protected static final int DocumentsPerLexicon = Integer.parseInt(ApplicationSetup.getProperty("bundle.size", "2000"));
+	/** The linkedlist in which the temporary lexicon structure names are stored.
	 * These are merged into a single Lexicon by the merge() method.
	 * LinkedList is the best List implementation for this, as all operations
	 * are either append element, or remove first element - making LinkedList
@@ -86,7 +82,7 @@
	protected LexiconMap TempLex;

	/** The directory to write temporary lexicons to */
-	protected String TemporaryLexiconDirectory = null;
+	//protected String TemporaryLexiconDirectory = null;

	/** The directory to write the final lexicons to */
	protected String indexPath = null;
@@ -99,51 +95,83 @@
	protected int TempLexCount = 0;

	/** How many temporary directories have been generated so far */
-	protected int TempLexDirCount = 0;
+	//protected int TempLexDirCount = 0;
	/** How many temporary lexicons per temporary directory.
Set from the property lexicon.builder.templexperdir, default 100 */
-	protected static final int TempLexPerDir = Integer.parseInt(ApplicationSetup.getProperty("lexicon.builder.templexperdir", "100"));
+	//protected static final int TempLexPerDir = Integer.parseInt(ApplicationSetup.getProperty("lexicon.builder.templexperdir", "100"));

	/** Should we only merge lexicons in pairs (Terrier 1.0.x scheme)? Set by property lexicon.builder.merge.2lex.attime */
	protected static final boolean MERGE2LEXATTIME = Boolean.parseBoolean(ApplicationSetup.getProperty("lexicon.builder.merge.2lex.attime", "false"));

	/** Number of lexicons to merge at once. Set by property lexicon.builder.merge.lex.max, defaults to 16 */
	protected static final int MAXLEXMERGE = Integer.parseInt(ApplicationSetup.getProperty("lexicon.builder.merge.lex.max", "16"));
-
-	/**
-	 * A default constructor of the class. The lexicon is built in the
-	 * default path and file: ApplicationSetup.TERRIER_INDEX_PATH and
-	 * ApplicationSetup.TERRIER_INDEX_PREFIX respectively.
-	 * @deprecated
-	 */
-	public LexiconBuilder()
+	
+	public interface CollectionStaticticsCounter<V> extends Closeable
	{
-		this(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX);
+		public void count(V value);
	}

-	public LexiconBuilder(Index i) {
+	static class BasicLexiconCollectionStaticticsCounter
+		implements CollectionStaticticsCounter<LexiconEntry>
+	{
+		long numberOfTokens = 0;
+		int numberOfTerms = 0;
+		long numberOfPointers = 0;
+		final Index index;
+		public BasicLexiconCollectionStaticticsCounter(Index _index)
+		{
+			index = _index;
+		}
+		
+		public void count(LexiconEntry value)
+		{
+			numberOfTokens += value.getFrequency();
+			numberOfPointers += value.getDocumentFrequency();
+			numberOfTerms++;
+		}
+		
+		public void close()
+		{
+			if (index != null)
+			{
+				index.setIndexProperty("num.Terms", ""+numberOfTerms);
+				index.setIndexProperty("num.Tokens", ""+numberOfTokens);
+				index.setIndexProperty("num.Pointers", ""+numberOfPointers);
+			}
+		}
+	}
+	
+	protected String defaultStructureName;
+	protected FixedSizeWriteableFactory valueFactory;
+	
+	
+	public LexiconBuilder(Index i, String _structureName) {
+		this(i, _structureName,
+			LexiconMap.class, "uk.ac.gla.terrier.structures.BasicLexiconEntry");
+	}
+	
+	@SuppressWarnings("unchecked")
+	protected LexiconBuilder(Index i, String _structureName,
+			Class _LexiconMapClass,
+			String _lexiconEntryClass)
+	{
		this.index = i;
		this.indexPath = index.getPath();
		this.indexPrefix = index.getPrefix();
-		TemporaryLexiconDirectory = indexPath + ApplicationSetup.FILE_SEPARATOR + indexPrefix + "_";
-		LexiconMapClass = LexiconMap.class;
+		this.defaultStructureName = _structureName;
+		//TemporaryLexiconDirectory = indexPath + ApplicationSetup.FILE_SEPARATOR + indexPrefix + "_";
+		LexiconMapClass = _LexiconMapClass;
+		lexiconEntryFactoryValueClass = _lexiconEntryClass;
		try{ TempLex = (LexiconMap) LexiconMapClass.newInstance(); } catch (Exception e) {logger.error(e);}
-		lexiconInputStream = LexiconInputStream.class;
-		lexiconOutputStream = LexiconOutputStream.class;
-	}
-
-	/**
-	 * Creates an instance of the class, given the path
-	 * to save the temporary lexicons.
- */ - public LexiconBuilder(String pathname, String prefix) { - indexPath = pathname; - indexPrefix = prefix; - TemporaryLexiconDirectory = pathname + ApplicationSetup.FILE_SEPARATOR + prefix + "_"; - LexiconMapClass = LexiconMap.class; - try{ TempLex = (LexiconMap) LexiconMapClass.newInstance(); } catch (Exception e) {logger.error(e);} - lexiconInputStream = LexiconInputStream.class; + + this.index.addIndexStructure( + defaultStructureName+"-keyfactory", + "uk.ac.gla.terrier.structures.seralization.FixedSizeTextFactory", + "java.lang.String", + "${max.term.length}" + ); + this.index.addIndexStructure(defaultStructureName+"-valuefactory", lexiconEntryFactoryValueClass+"$Factory", "", ""); + valueFactory = (FixedSizeWriteableFactory)this.index.getIndexStructure(defaultStructureName+"-valuefactory"); lexiconOutputStream = LexiconOutputStream.class; } @@ -155,9 +183,11 @@ /** If the application code generated lexicons itself, use this method to add them to the merge list * Otherwise don't touch this method. - * @param filename Fully path to a lexicon to merge */ - public void addTemporaryLexicon(String filename) { - filename = ApplicationSetup.makeAbsolute(filename, TemporaryLexiconDirectory); + * @param structureName the name of the temporary lexicon structure to merge + * @deprecated */ + public void addTemporaryLexicon(String structureName) { + tempLexFiles.addLast(structureName); + //filename = ApplicationSetup.makeAbsolute(filename, TemporaryLexiconDirectory); } /** Writes the current contents of TempLex temporary lexicon binary tree down to @@ -166,19 +196,21 @@ protected void writeTemporaryLexicon() { try{ - TempLexDirCount = TempLexCount / TempLexPerDir; - if (! Files.exists(TemporaryLexiconDirectory + TempLexDirCount)) { - String tmpDir = TemporaryLexiconDirectory + TempLexDirCount; - Files.mkdir(tmpDir); - Files.deleteOnExit(tmpDir);//it's fine to mark the temporary *directory* for deletion - } - String tmpLexName = TemporaryLexiconDirectory + TempLexDirCount + ApplicationSetup.FILE_SEPARATOR + - (TempLexCount) + ApplicationSetup.LEXICONSUFFIX; - LexiconOutputStream los = getLexOutputStream(tmpLexName); + //TempLexDirCount = TempLexCount / TempLexPerDir; + //if (! Files.exists(TemporaryLexiconDirectory + TempLexDirCount)) { + // String tmpDir = TemporaryLexiconDirectory + TempLexDirCount; + // Files.mkdir(tmpDir); + // Files.deleteOnExit(tmpDir);//it's fine to mark the temporary *directory* for deletion + //} + //String tmpLexName = TemporaryLexiconDirectory + TempLexDirCount + ApplicationSetup.FILE_SEPARATOR + TempLexCount; + //LexiconOutputStream los = getLexOutputStream(TempLexDirCount+""+TempLexCount); + final String tmpLexName = this.defaultStructureName+"-tmp"+ TempLexCount; + LexiconOutputStream<String> los = getLexOutputStream(tmpLexName); TempLex.storeToStream(los); los.close(); /* An alternative but deprecated method to store the temporary lexicons is: * TempLex.storeToFile(tmpLexName); */ + //tempLexFiles.addLast(TempLexDirCount+""+TempLexCount); tempLexFiles.addLast(tmpLexName); }catch(IOException ioe){ logger.error("Indexing failed to write a lexicon to disk : ", ioe); @@ -224,15 +256,7 @@ * inverted index.
*/ public void finishedInvertedIndexBuild() { - if (Boolean.parseBoolean(ApplicationSetup.getProperty("lexicon.use.hash","true"))) { - String lexiconFilename = indexPath + ApplicationSetup.FILE_SEPARATOR + indexPrefix + ApplicationSetup.LEXICONSUFFIX; - LexiconInputStream lexStream = getLexInputStream(lexiconFilename); - this.createLexiconHash(lexStream); - } - if (index != null) - { - index.addIndexStructure("lexicon", "uk.ac.gla.terrier.structure.Lexicon"); - } + LexiconBuilder.optimise(index, defaultStructureName); } /** @@ -251,40 +275,23 @@ //merges the temporary lexicons if (tempLexFiles.size() > 0) { - Set<String> tempDirectories = new HashSet<String>(); - for(String tmpLex : tempLexFiles) - { - tempDirectories.add(Files.getParent(tmpLex)); - } + //Set<String> tempDirectories = new HashSet<String>(); + //for(String tmpLex : tempLexFiles) + //{ + // tempDirectories.add(Files.getParent(tmpLex)); + //} try{ merge(tempLexFiles); - - //creates the offsets file - final String lexiconFilename = - indexPath + ApplicationSetup.FILE_SEPARATOR + - indexPrefix + ApplicationSetup.LEXICONSUFFIX; - LexiconInputStream lis = getLexInputStream(lexiconFilename); - createLexiconIndex( - lis, - lis.numberOfEntries(), - Lexicon.lexiconEntryLength - ); - TermCount = lis.numberOfEntries(); - if (index != null) - { - index.addIndexStructure("lexicon", "uk.ac.gla.terrier.structures.Lexicon"); - index.addIndexStructureInputStream("lexicon", "uk.ac.gla.terrier.structures.LexiconInputStream"); - index.setIndexProperty("num.Terms", ""+lis.numberOfEntries()); - index.setIndexProperty("num.Pointers", ""+lis.getNumberOfPointersRead()); - } + //creates the offsets and hash file + LexiconBuilder.optimise(index, defaultStructureName); } catch(IOException ioe){ logger.error("Indexing failed to merge temporary lexicons to disk : ", ioe); } - for (String tmpDir : tempDirectories) - { - Files.delete(tmpDir); - } + //for (String tmpDir : tempDirectories) + //{ + // Files.delete(tmpDir); + //} } else logger.warn("No temporary lexicons to merge, skipping"); @@ -297,6 +304,7 @@ * @throws IOException an input/output exception is thrown * if a problem is encountered. */ + @SuppressWarnings("unchecked") public void merge(LinkedList<String> filesToMerge) throws IOException { //now the merging of the files in the filesToMerge vector //must take place.
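Callers of the now-deprecated createLexiconIndex/createLexiconHash can move to the single optimise entry point; a hedged sketch of re-optimising an existing structure offline (the wrapper class is hypothetical):

import uk.ac.gla.terrier.structures.Index;
import uk.ac.gla.terrier.structures.indexing.LexiconBuilder;

public class OptimiseLexicon
{
	public static void main(String[] args) throws Exception
	{
		Index index = Index.createIndex();
		// rebuilds the lookup structures for the named lexicon and refreshes
		// num.Terms, num.Tokens and num.Pointers via the statistics counter
		LexiconBuilder.optimise(index, "lexicon");
		index.close();
	}
}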
@@ -318,7 +326,8 @@ } if (StartFileCount == 1) { - Files.rename(filesToMerge.removeFirst(), indexPath + ApplicationSetup.FILE_SEPARATOR +indexPrefix + ApplicationSetup.LEXICONSUFFIX); + MapFileLexicon.renameMapFileLexicon(filesToMerge.removeFirst(), index.getPath(), index.getPrefix(), + defaultStructureName, index.getPath(), index.getPrefix()); } else if (MERGE2LEXATTIME) { @@ -326,7 +335,7 @@ if (logger.isDebugEnabled()) logger.debug("begin merging "+ StartFileCount +" temporary lexicons, in pairs..."); long startTime = System.currentTimeMillis(); - int progressiveNumber = ApplicationSetup.MERGE_TEMP_NUMBER; + int progressiveNumber = 0; String newMergedFile = null; while (filesToMerge.size() > 1) { String fileToMerge1 = (String) filesToMerge.removeFirst(); @@ -334,23 +343,17 @@ //give the proper name to the final merged lexicon if (filesToMerge.size() == 0) - newMergedFile = indexPath + ApplicationSetup.FILE_SEPARATOR + - indexPrefix + ApplicationSetup.LEXICONSUFFIX; + newMergedFile = defaultStructureName; else - newMergedFile = - Files.getParent(fileToMerge1) - + ApplicationSetup.FILE_SEPARATOR - + ApplicationSetup.MERGE_PREFIX - + String.valueOf(progressiveNumber++) - + ApplicationSetup.LEXICONSUFFIX; + newMergedFile = defaultStructureName + "-mergetmp"+ String.valueOf(progressiveNumber++); //The opening of the files needs to break into more steps, so that //all the open streams are closed after the completion of the //operation, and eventually the intermediate files are deleted. - LexiconInputStream lis1 = getLexInputStream(fileToMerge1); - LexiconInputStream lis2 = getLexInputStream(fileToMerge2); - LexiconOutputStream los = getLexOutputStream(newMergedFile); + Iterator> lis1 = getLexInputStream(fileToMerge1); + Iterator> lis2 = getLexInputStream(fileToMerge2); + LexiconOutputStream los = getLexOutputStream(newMergedFile); if (logger.isDebugEnabled()) logger.debug( @@ -364,8 +367,8 @@ mergeTwoLexicons(lis1, lis2, los); //delete the two files just merged - Files.delete(fileToMerge1); - Files.delete(fileToMerge2); + MapFileLexicon.deleteMapFileLexicon(fileToMerge1, indexPath, indexPrefix); + MapFileLexicon.deleteMapFileLexicon(fileToMerge2, indexPath, indexPrefix); filesToMerge.addLast(newMergedFile); } long endTime = System.currentTimeMillis(); @@ -377,7 +380,7 @@ if (logger.isDebugEnabled()) logger.debug("begin merging "+ StartFileCount +" files in batches of upto "+mergeNMaxLexicon+"..."); long startTime = System.currentTimeMillis(); - int progressiveNumber = ApplicationSetup.MERGE_TEMP_NUMBER; + int progressiveNumber = 0; while (filesToMerge.size() > 1) @@ -386,7 +389,7 @@ if (logger.isDebugEnabled()) logger.debug("merging "+ numLexicons + " temporary lexicons"); final String inputLexiconFileNames[] = new String[numLexicons]; - final LexiconInputStream[] lis = new LexiconInputStream[numLexicons]; + final Iterator>[] lis = (Iterator>[])new Iterator[numLexicons]; for(int i=0;i los = getLexOutputStream(newMergedFile); mergeNLexicons(lis, los); - for(int i=0;i>[] lis = + (Iterator>[]) new Iterator[StartFileCount]; for(int i=0;i los = getLexOutputStream(defaultStructureName); mergeNLexicons(lis, los); for(int i=0;i>[] lis, LexiconOutputStream los) throws IOException { final int numLexicons = lis.length; - long totalTokens = 0; - long totalPointers = 0; - int hasMore[] = new int[numLexicons]; - Arrays.fill(hasMore, -1); + boolean hasMore[] = new boolean[numLexicons]; + Map.Entry[] currentEntries = new Map.Entry[numLexicons]; + + Arrays.fill(hasMore, false); PriorityQueue terms = new 
PriorityQueue(numLexicons); for(int i=0;i 0) { //what term are we working on @@ -470,24 +486,28 @@ { //does this lexicon contain the term //logger.debug("Checking lexicon "+i+" for "+targetTerm+"="+lis[i].getTerm()); - if(hasMore[i] != -1 && lis[i].getTerm().equals(targetTerm)) + if(hasMore[i] && currentEntries[i].getKey().equals(targetTerm)) { if (targetTermId == -1) { //obtain the termid for this term from the first lexicon that has the term - targetTermId = lis[i].getTermId(); + nextEntryToWrite = newLexiconEntry(targetTermId = currentEntries[i].getValue().getTermId()); } - else if (targetTermId != lis[i].getTermId()) + else if (targetTermId != currentEntries[i].getValue().getTermId()) { //check the termids match for this term - logger.error("Term "+targetTerm+" had two termids ("+targetTermId+","+lis[i].getTermId()+")"); + logger.error("Term "+targetTerm+" had two termids ("+targetTermId+","+currentEntries[i].getValue().getTermId()+")"); } //logger.debug("Term "+targetTerm + " found in "+i + "termid="+ lis[i].getTermId()); - Tf += lis[i].getTF(); - Nt += lis[i].getNt(); - hasMore[i] = lis[i].readNextEntry(); - if (hasMore[i] != -1) + nextEntryToWrite.add(currentEntries[i].getValue()); + hasMore[i] = lis[i].hasNext(); + + if (hasMore[i]) + { + currentEntries[i] = lis[i].next(); + terms.add(currentEntries[i].getKey()); + } + else { - terms.add(lis[i].getTerm()); - //break; + currentEntries[i] = null; } break; } @@ -499,19 +519,18 @@ logger.error("Term "+ targetTerm + " not found in any lexicons"); } //end of this term, so we can write the lexicon entry - totalTokens += Tf; - totalPointers += Nt; - los.writeNextEntry(targetTerm, targetTermId, Nt, Tf, 0, (byte)0); - Tf = Nt = 0; targetTermId = -1; targetTerm = null; + los.writeNextEntry(targetTerm, nextEntryToWrite); + nextEntryToWrite = null; targetTermId = -1; targetTerm = null; } } - totalTokens += Tf; - totalPointers += Nt; if (targetTermId != -1) - los.writeNextEntry(targetTerm, targetTermId, Nt, Tf, 0, (byte)0); + los.writeNextEntry(targetTerm, nextEntryToWrite); los.close(); for(int i=0;i> lis1, + Iterator> lis2, + LexiconOutputStream los) throws IOException { //We always take the first two entries of @@ -538,21 +557,22 @@ int termID1 = 0; int termID2 = 0; - long totalTokens = 0; - long totalPointers = 0; - - hasMore1 = (lis1.readNextEntry()!=-1); - hasMore2 = (lis2.readNextEntry()!=-1); + hasMore1 = lis1.hasNext(); + hasMore2 = lis2.hasNext(); String sTerm1 = null; String sTerm2 = null; + Map.Entry lee1 = null; + Map.Entry lee2 = null; if (hasMore1) { - termID1 = lis1.getTermId(); - sTerm1 = lis1.getTerm(); + lee1 = lis1.next(); + termID1 = lee1.getValue().getTermId(); + sTerm1 = lee1.getKey(); } if (hasMore2) { - termID2 = lis2.getTermId(); - sTerm2 = lis2.getTerm(); + lee2 = lis2.next(); + termID2 = lee2.getValue().getTermId(); + sTerm2 = lee2.getKey(); } while (hasMore1 && hasMore2) { int compareString = 0; @@ -567,341 +587,137 @@ } if (compareString <0) { - totalTokens += lis1.getTF(); - totalPointers += lis1.getNt(); - los.writeNextEntry(sTerm1, termID1, lis1.getNt(), lis1.getTF(), lis1.getEndOffset(), lis1.getEndBitOffset()); - hasMore1 = (lis1.readNextEntry()!=-1); + los.writeNextEntry(sTerm1, lee1.getValue()); + hasMore1 = lis1.hasNext(); if (hasMore1) { - termID1 = lis1.getTermId(); - sTerm1 = lis1.getTerm(); + lee1 = lis1.next(); + termID1 = lee1.getValue().getTermId(); + sTerm1 = lee1.getKey(); } } else if (compareString >0) { - totalTokens += lis2.getTF(); - totalPointers += lis2.getNt(); - los.writeNextEntry(sTerm2, 
termID2, lis2.getNt(), lis2.getTF(), lis2.getEndOffset(), lis2.getEndBitOffset()); - hasMore2 = (lis2.readNextEntry()!=-1); + los.writeNextEntry(sTerm2, lee2.getValue()); + hasMore2 = lis2.hasNext(); if (hasMore2) { - termID2 = lis2.getTermId(); - sTerm2 = lis2.getTerm(); + lee2 = lis2.next(); + termID2 = lee2.getValue().getTermId(); + sTerm2 = lee2.getKey(); } } else /*if (compareString == 0)*/ { - totalTokens += lis1.getTF() + lis2.getTF(); - totalPointers += lis1.getNt() + lis2.getNt(); + lee1.getValue().add(lee2.getValue()); los.writeNextEntry( sTerm1, - termID1, - lis1.getNt() + lis2.getNt(), - lis1.getTF() + lis2.getTF(), - 0, //inverted index not built yet, so no offsets - (byte)0 //inverted index not built yet, so no offsets + lee1.getValue() ); - - hasMore1 = (lis1.readNextEntry()!=-1); - hasMore2 = (lis2.readNextEntry()!=-1); + hasMore1 = lis1.hasNext(); + hasMore2 = lis2.hasNext(); if (hasMore1) { - termID1 = lis1.getTermId(); - sTerm1 = lis1.getTerm(); + lee1 = lis1.next(); + termID1 = lee1.getValue().getTermId(); + sTerm1 = lee1.getKey(); } if (hasMore2) { - termID2 = lis2.getTermId(); - sTerm2 = lis2.getTerm(); + lee2 = lis2.next(); + termID2 = lee2.getValue().getTermId(); + sTerm2 = lee2.getKey(); } } } if (hasMore1) { - lis2.close(); + if (lis2 instanceof Closeable) { + ((Closeable)lis2).close(); + } while (hasMore1) { - totalTokens += lis1.getTF(); - totalPointers += lis1.getNt(); - los.writeNextEntry(sTerm1, termID1, lis1.getNt(), lis1.getTF(), lis1.getEndOffset(), lis1.getEndBitOffset()); - hasMore1 = (lis1.readNextEntry()!=-1); + los.writeNextEntry(sTerm1, lee1.getValue()); + hasMore1 = lis1.hasNext(); if (hasMore1) { - termID1 = lis1.getTermId(); - sTerm1 = lis1.getTerm(); + lee1 = lis1.next(); + termID1 = lee1.getValue().getTermId(); + sTerm1 = lee1.getKey(); } } //close input file 1 stream - lis1.close(); + if (lis1 instanceof Closeable) { + ((Closeable)lis1).close(); + } } else if (hasMore2) { - lis1.close(); + if (lis1 instanceof Closeable) { + ((Closeable)lis1).close(); + } while (hasMore2) { - totalTokens += lis2.getTF(); - totalPointers += lis2.getNt(); - los.writeNextEntry(sTerm2, termID2, lis2.getNt(), lis2.getTF(), lis2.getEndOffset(), lis2.getEndBitOffset()); - hasMore2 = (lis2.readNextEntry()!=-1); + los.writeNextEntry(sTerm2, lee2.getValue()); + hasMore2 = lis2.hasNext(); if (hasMore2) { - termID2 = lis2.getTermId(); - sTerm2 = lis2.getTerm(); + lee2 = lis2.next(); + termID2 = lee2.getValue().getTermId(); + sTerm2 = lee2.getKey(); } } //close input file 2 stream - lis2.close(); + if (lis2 instanceof Closeable) { + ((Closeable)lis2).close(); + } } //close output file streams los.close(); } - /** - * Creates the lexicon index file that contains a mapping from the - * given term id to the offset in the lexicon, in order to - * be able to retrieve the term information according to the - * term identifier. This is necessary, because the terms in the lexicon - * file are saved in lexicographical order, and we also want to have - * fast access based on their term identifier. - * @param lexicon The input stream of the lexicon that we are creating the lexid file for - * @param lexiconEntries The number of entries in this lexicon - * @param lexiconEntrySize The size of one entry in this lexicon - * @exception java.io.IOException Throws an Input/Output exception if - * there is an input/output error.
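The merge routines above consume lexicons purely as iterators over Map.Entry<String,LexiconEntry>. A hedged sketch of walking such a stream directly (mirrors the cast and the instanceof-Closeable idiom used throughout this patch; the wrapper class is illustrative):

import java.util.Iterator;
import java.util.Map;
import uk.ac.gla.terrier.structures.Closeable;
import uk.ac.gla.terrier.structures.Index;
import uk.ac.gla.terrier.structures.LexiconEntry;

public class WalkLexicon
{
	@SuppressWarnings("unchecked")
	public static void main(String[] args) throws Exception
	{
		Index index = Index.createIndex();
		Iterator<Map.Entry<String,LexiconEntry>> lexIn = (Iterator<Map.Entry<String,LexiconEntry>>)
			index.getIndexStructureInputStream("lexicon");
		while (lexIn.hasNext())
		{
			Map.Entry<String,LexiconEntry> e = lexIn.next();
			System.out.println(e.getKey()
				+ " Nt=" + e.getValue().getDocumentFrequency()
				+ " TF=" + e.getValue().getFrequency());
		}
		// lexicon streams are not uniformly closeable, hence the runtime check
		if (lexIn instanceof Closeable)
			((Closeable)lexIn).close();
		index.close();
	}
}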
- */ - public void createLexiconIndex(final LexiconInputStream lexicon, - final int lexiconEntries, - final int lexiconEntrySize) throws IOException { - createLexiconIndex(lexicon, lexiconEntries,lexiconEntrySize, indexPath, indexPrefix); - } - /** - * Creates the lexicon index file that contains a mapping from the - * given term id to the offset in the lexicon, in order to - * be able to retrieve the term information according to the - * term identifier. This is necessary, because the terms in the lexicon - * file are saved in lexicographical order, and we also want to have - * fast access based on their term identifier. - * @param lexicon The input stream of the lexicon that we are creating the lexid file for - * @param lexiconEntries The number of entries in this lexicon - * @param lexiconEntrySize The size of one entry in this lexicon - * @param path The path to the index containing the lexicon - * @param prefix The prefix of the index containing the lexicon - * @exception java.io.IOException Throws an Input/Output exception if - * there is an input/output error. - */ - public static void createLexiconIndex(final LexiconInputStream lexicon, - final int lexiconEntries, final int lexiconEntrySize, - final String path, final String prefix) throws IOException - { - //save the offsets to a file with the same name as - //the lexicon and extension .lexid - String lexid = path + - ApplicationSetup.FILE_SEPARATOR + - prefix + - ApplicationSetup.LEXICON_INDEX_SUFFIX; - DataOutputStream dosLexid = new DataOutputStream(Files.writeFileStream(lexid)); - createLexiconIndex(lexicon, lexiconEntries, lexiconEntrySize, dosLexid); - } - - public static void createLexiconIndex(final LexiconInputStream lexicon, - final int lexiconEntries, final int lexiconEntrySize, - final DataOutputStream dosLexid) throws IOException - { - - /* - * This method reads from the lexicon the term ids and stores the - * corresponding offsets in an array. Then this array is written out - * in order according to the term id. - */ - long totalPointers = 0; - long totalTokens = 0; - - - //the i-th element of offsets contains the offset in the - //lexicon file of the term with term identifier equal to i. - long[] offsets = new long[lexiconEntries]; - int termid = -1; - int i=0; - try{ - while (lexicon.readNextEntry()!=-1) { - termid = lexicon.getTermId(); - totalPointers += lexicon.getNt(); - totalTokens += lexicon.getTF(); - //Debugging: if an exception occurs here, then this infers that the number of entries in the lexicon - //has been calculated incorrectly, or that termId > lexiconEntries. 
termid > lexiconEntries could be - //a sign that the lexicon is being decoded incorrecty - eg you're using LexiconInputStream instead of - //UTFLexiconInputStream - offsets[termid] = (long)i * (long)lexiconEntrySize; - i++; - } - } catch (ArrayIndexOutOfBoundsException aioob) { - logger.error("Termid overflow while creating lexid file: NumEntries="+lexiconEntries+ " entrySize=" - +lexiconEntrySize+ " termid="+termid, aioob); - } - lexicon.close(); - //write out the offsets - for (i = 0; i < lexiconEntries; i++) { - dosLexid.writeLong(offsets[i]); - } - dosLexid.close(); - } - /** Creates a lexicon index for the specified index * @param index Index to make the lexicon index for - */ + * @deprecated use optimise instead + */ public static void createLexiconIndex(Index index) throws IOException { - final LexiconInputStream lis = (LexiconInputStream)index.getIndexStructureInputStream("lexicon"); - LexiconBuilder.createLexiconIndex( - lis, - index.getCollectionStatistics().getNumberOfUniqueTerms(), - lis.getEntrySize(), - index.getPath(), - index.getPrefix()); + optimise(index, "lexicon"); } - /** Create a lexicon hash for the current index - * @param lexStream lexiconinputstream to process - */ - public void createLexiconHash(final LexiconInputStream lexStream) { - LexiconBuilder.createLexiconHash(lexStream, indexPath, indexPrefix); - } /** Creates a lexicon hash for the specified index - * @param index Index to make the LexiconHash for + * @param index Index to make the LexiconHash the lexicoin + * @deprecated use optimise instead */ public static void createLexiconHash(final Index index) throws IOException { - LexiconBuilder.createLexiconHash((LexiconInputStream)index.getIndexStructureInputStream("lexicon"), - index.getPath(),index.getPrefix()); - } - - /** - * Creates a Lexicon hash. This method reads the lexicon and finds the entries which - * start with a different letter. The offset of these entries - * is used to speed up the binary search performed during retrieval. - * These offsets are saved to a lex hash file beside the Lexicon in the Index. - * @param lexStream LexiconInputStream to process - * @param path Path to the index containing the lexicon - * @param prefix Prefix of the index containing the lexicon - */ - public static void createLexiconHash(final LexiconInputStream lexStream, final String path, final String prefix) { - String filename = path + ApplicationSetup.FILE_SEPARATOR + prefix + ApplicationSetup.LEXICON_HASH_SUFFIX; - try{ - createLexiconHash(lexStream, Files.writeFileStream(filename)); - } catch(IOException ioe) { - logger.error("IOException while creating hash file in LexiconBuilder.createLexiconHash: " + ioe); - } + optimise(index, "lexicon"); } - public static void createLexiconHash(final LexiconInputStream lexStream, OutputStream out) - { - TIntObjectHashMap map = new TIntObjectHashMap(); - int previousFirstChar = -1; - int firstChar = 0; - int counter = -1; - - try { - //read all the terms in the lexicon and - //mark the offset of the ones that start - //with a different character from the - //previous entry. 
- while (lexStream.readNextEntry()!=-1) { - firstChar = lexStream.getTerm().charAt(0); - if (firstChar!=previousFirstChar) { - int[] boundaries = new int[] {counter, 0}; - map.put(firstChar, boundaries); - previousFirstChar = firstChar; - } - counter++; - } - lexStream.close(); - - - //NB: map should not be too large, say 26+10, more if UTF characters - - // after reading all the entries, update the upper - // boundary, which is zero from the previous step. - int[] mapKeys = map.keys(); - Arrays.sort(mapKeys); - final int mapKeysSize = mapKeys.length; - for (int i=0; i<mapKeysSize; i++) { + public static void optimise(Index index, String structureName) + { + try{ + CollectionStaticticsCounter<LexiconEntry> counter = new BasicLexiconCollectionStaticticsCounter(index); + MapFileLexicon.optimise(structureName, index, counter); + counter.close(); + } catch(IOException ioe) { + logger.error("IOException while optimising lexicon called " + structureName, ioe); } } /** return the lexicon input stream for the current index, for the specified structure name */ - protected LexiconInputStream getLexInputStream(String filename) + @SuppressWarnings("unchecked") + protected Iterator<Map.Entry<String,LexiconEntry>> getLexInputStream(String structureName) throws IOException { - LexiconInputStream li = null; - try{ - li = (LexiconInputStream) lexiconInputStream.getConstructor(String.class).newInstance(filename); - } catch (Exception e) { - logger.error("Problem loading a LexiconInputStream", e); - } - return li; + return new MapFileLexicon.MapFileLexiconIterator(structureName, index.getPath(), index.getPrefix(), + (FixedSizeWriteableFactory<Text>)index.getIndexStructure(defaultStructureName+"-keyfactory"), + (FixedSizeWriteableFactory<LexiconEntry>)index.getIndexStructure(defaultStructureName+"-valuefactory")); } /** return the lexicon output stream for the current index, for the specified structure name */ - protected LexiconOutputStream getLexOutputStream(String filename) + @SuppressWarnings("unchecked") + protected LexiconOutputStream<String> getLexOutputStream(String structureName) throws IOException { - LexiconOutputStream lo = null; - try{ - lo = (LexiconOutputStream) lexiconOutputStream.getConstructor(String.class).newInstance(filename); - } catch (Exception e) { - logger.error("Problem loading a LexiconOutputStream", e); - } - return lo; + return new MapFileLexiconOutputStream( + index.getPath(), index.getPrefix(), + structureName, + (FixedSizeWriteableFactory<Text>)index.getIndexStructure(defaultStructureName+"-keyfactory")); } } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/LexiconMap.java src/uk/ac/gla/terrier/structures/indexing/LexiconMap.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/LexiconMap.java 2009-01-28 20:16:58.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/indexing/LexiconMap.java 2009-03-03 14:34:49.000000000 +0000 @@ -32,6 +32,7 @@ import java.io.IOException; import java.util.Arrays; +import uk.ac.gla.terrier.structures.BasicLexiconEntry; import uk.ac.gla.terrier.structures.LexiconOutputStream; import uk.ac.gla.terrier.utility.ApplicationSetup; import uk.ac.gla.terrier.utility.TermCodes; @@ -96,14 +97,16 @@ * The binary tree is traversed in order, by calling the method * traverseAndStoreToStream. * @param lexiconStream The lexicon output stream to store to.
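storeToStream, below, reuses a single BasicLexiconEntry across all terms rather than allocating one per term. A minimal sketch of that write path in isolation (the helper class and its parameters are hypothetical):

import java.io.IOException;
import uk.ac.gla.terrier.structures.BasicLexiconEntry;
import uk.ac.gla.terrier.structures.LexiconOutputStream;

public class WriteEntrySketch
{
	/** write one term with document frequency nt and term frequency tf */
	static void writeTerm(LexiconOutputStream<String> los,
			String term, int termId, int nt, int tf) throws IOException
	{
		BasicLexiconEntry le = new BasicLexiconEntry();
		le.setTermId(termId);
		le.setStatistics(nt, tf);
		los.writeNextEntry(term, le);
	}
}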
*/ - public void storeToStream(LexiconOutputStream lexiconStream) throws IOException { - final byte zerob = (byte)0; - final long zerol = (long)0; + public void storeToStream(LexiconOutputStream lexiconStream) throws IOException + { final String[] terms = tfs.keys(new String[0]); Arrays.sort(terms); + BasicLexiconEntry le = new BasicLexiconEntry();//TODO could use the one without positions for (String t : terms) { - lexiconStream.writeNextEntry(t, TermCodes.getCode(t), nts.get(t), tfs.get(t), zerol, zerob); + le.setTermId(TermCodes.getCode(t)); + le.setStatistics(nts.get(t), tfs.get(t)); + lexiconStream.writeNextEntry(t, le); } } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/TermEstimateIndex.java src/uk/ac/gla/terrier/structures/indexing/TermEstimateIndex.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/TermEstimateIndex.java 2009-01-28 20:16:58.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/indexing/TermEstimateIndex.java 2009-03-03 14:34:49.000000000 +0000 @@ -25,15 +25,17 @@ */ package uk.ac.gla.terrier.structures.indexing; import java.io.DataInputStream; -import java.io.File; import java.io.IOException; +import java.util.Iterator; +import java.util.Map; import org.apache.log4j.Logger; +import uk.ac.gla.terrier.structures.Closeable; import uk.ac.gla.terrier.structures.CollectionStatistics; import uk.ac.gla.terrier.structures.Index; import uk.ac.gla.terrier.structures.Lexicon; -import uk.ac.gla.terrier.structures.LexiconInputStream; +import uk.ac.gla.terrier.structures.LexiconEntry; import uk.ac.gla.terrier.utility.ApplicationSetup; import uk.ac.gla.terrier.utility.Files; /** @@ -42,20 +44,16 @@ */ public class TermEstimateIndex { private static Logger logger = Logger.getRootLogger(); - protected final Lexicon lex; + protected final Lexicon lex; protected final int numTerms; /** The array of term estimate for each term. It is sorted by termid. */ protected double[] termEstimate; /** The filename of the term estimate index on disk. */ protected String INDEX_FILENAME; - /** - * The default constructor. 
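The zero-argument constructor is gone: a TermEstimateIndex is now built against an explicit Index, and I/O failures propagate to the caller instead of being swallowed. A hedged construction sketch (wrapper class illustrative):

import uk.ac.gla.terrier.structures.Index;
import uk.ac.gla.terrier.structures.indexing.TermEstimateIndex;

public class LoadTermEstimates
{
	public static void main(String[] args) throws Exception
	{
		Index index = Index.createIndex();
		// throws IOException if the term estimate file cannot be read
		TermEstimateIndex tei = new TermEstimateIndex(index);
		index.close();
	}
}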
- */ - public TermEstimateIndex() { - this( Index.createIndex() ); - } - public TermEstimateIndex(Index index) + + @SuppressWarnings("unchecked") + public TermEstimateIndex(Index index) throws IOException { final String path = index.getPath(); final String prefix = index.getPrefix(); @@ -71,34 +69,29 @@ //always use a lexiconinputstream, as blocklexicons don't exist past invertedindex creation //but check if we're using UTF - final LexiconInputStream lexin = (LexiconInputStream)index.getIndexStructureInputStream("lexicon"); - - - for (int i = 0; i < termids.length; i++){ - try{ - lexin.readNextEntry(); - termids[i] = lexin.getTermId(); - } - catch(IOException ioe){ - logger.error("Problem reading lexicon input stream while loading TermEstimateIndex"); - - } + final Iterator<Map.Entry<String,LexiconEntry>> lexin = + (Iterator<Map.Entry<String,LexiconEntry>>)index.getIndexStructureInputStream("lexicon"); + + int i=0; + while(lexin.hasNext()) + { + termids[i++] = lexin.next().getValue().getTermId(); } - lexin.close(); + if (lexin instanceof Closeable) + ((Closeable)lexin).close(); - if (Files.exists(INDEX_FILENAME)){ + if (! Files.exists(INDEX_FILENAME)){ - try{ - DataInputStream in = new DataInputStream( - Files.openFileStream(INDEX_FILENAME)); - for (int i = 0; i < collectionStatistics.getNumberOfUniqueTerms(); i++){ - this.termEstimate[termids[i]] = in.readDouble(); - } - in.close(); - } - catch(IOException ioe){ - logger.error("Problem reading TermEstimateIndex at "+INDEX_FILENAME, ioe); - } + logger.error("Could not load TermEstimate index"); + return; + } + + DataInputStream in = new DataInputStream( + Files.openFileStream(INDEX_FILENAME)); + final int termCount = collectionStatistics.getNumberOfUniqueTerms(); + for (i = 0; i < termCount; i++){ + this.termEstimate[termids[i]] = in.readDouble(); } + in.close(); } /** * This method prints all the entries in the term estimate index. @@ -110,9 +103,9 @@ Files.openFileStream(INDEX_FILENAME)); for (int i = 0; i < numTerms; i++){ double te = in.readDouble(); - lex.seekEntry(i); + Map.Entry<String,LexiconEntry> lee = lex.getIthLexiconEntry(i); if(logger.isDebugEnabled()){ - logger.debug(lex.getTerm() + ": " + te); + logger.debug(lee.getKey() + ": " + te); } } in.close(); diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/UTFBlockInvertedIndexBuilder.java src/uk/ac/gla/terrier/structures/indexing/UTFBlockInvertedIndexBuilder.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/UTFBlockInvertedIndexBuilder.java 2009-01-28 20:16:58.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/indexing/UTFBlockInvertedIndexBuilder.java 1970-01-01 01:00:00.000000000 +0100 @@ -1,83 +0,0 @@ -/* - * Terrier - Terabyte Retriever - * Webpage: http://ir.dcs.gla.ac.uk/terrier - * Contact: terrier{a.}dcs.gla.ac.uk - * University of Glasgow - Department of Computing Science - * http://www.gla.ac.uk/ - * - * The contents of this file are subject to the Mozilla Public License - * Version 1.1 (the "License"); you may not use this file except in - * compliance with the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is UTFBlockInvertedIndexBuilder.java. - * - * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. - * All Rights Reserved.
- * - * Contributor(s): - * Douglas Johnson (original author) - * Vassilis Plachouras - * Craig Macdonald - */ -package uk.ac.gla.terrier.structures.indexing; -import uk.ac.gla.terrier.structures.Index; -import uk.ac.gla.terrier.structures.UTFBlockLexiconInputStream; -import uk.ac.gla.terrier.structures.UTFLexiconOutputStream; -/** - * Builds an inverted index using block information, where indexing lexicon is a UTFBlock lexicon. It is optional to - * save field information as well. - * @author Douglas Johnson & Vassilis Plachouras & Craig Macdonald - * @version $Revision: 1.12 $ - */ -public class UTFBlockInvertedIndexBuilder extends BlockInvertedIndexBuilder { - - public UTFBlockInvertedIndexBuilder(Index i) - { - super(i); - lexiconInputStream = UTFBlockLexiconInputStream.class; - lexiconOutputStream = UTFLexiconOutputStream.class; - finalLexiconClass = "uk.ac.gla.terrier.structures.UTFLexicon"; - finalLexiconInputStreamClass = "uk.ac.gla.terrier.structures.UTFLexiconInputStream"; - } - - /** - * Creates an instance of the BlockInvertedIndex class. - * @deprecated - */ - public UTFBlockInvertedIndexBuilder() { - super(); - lexiconInputStream = UTFBlockLexiconInputStream.class; - lexiconOutputStream = UTFLexiconOutputStream.class; - finalLexiconClass = "uk.ac.gla.terrier.structures.UTFLexicon"; - finalLexiconInputStreamClass = "uk.ac.gla.terrier.structures.UTFLexiconInputStream"; - } - /** - * Creates an instance of the BlockInvertedIndex class - * using the given filename. - * @param filename the name of the inverted file - * @deprecated - */ - public UTFBlockInvertedIndexBuilder(String filename) { - super(filename); - lexiconInputStream = UTFBlockLexiconInputStream.class; - lexiconOutputStream = UTFLexiconOutputStream.class; - finalLexiconClass = "uk.ac.gla.terrier.structures.UTFLexicon"; - finalLexiconInputStreamClass = "uk.ac.gla.terrier.structures.UTFLexiconInputStream"; - } - - /** - @deprecated */ - public UTFBlockInvertedIndexBuilder(String path, String prefix) { - super(path, prefix); - lexiconInputStream = UTFBlockLexiconInputStream.class; - lexiconOutputStream = UTFLexiconOutputStream.class; - finalLexiconClass = "uk.ac.gla.terrier.structures.UTFLexicon"; - finalLexiconInputStreamClass = "uk.ac.gla.terrier.structures.UTFLexiconInputStream"; - } -} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/UTFBlockLexiconBuilder.java src/uk/ac/gla/terrier/structures/indexing/UTFBlockLexiconBuilder.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/UTFBlockLexiconBuilder.java 2009-01-28 20:16:58.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/indexing/UTFBlockLexiconBuilder.java 1970-01-01 01:00:00.000000000 +0100 @@ -1,135 +0,0 @@ -/* - * Terrier - Terabyte Retriever - * Webpage: http://ir.dcs.gla.ac.uk/terrier - * Contact: terrier{a.}dcs.gla.ac.uk - * University of Glasgow - Department of Computing Science - * http://www.gla.ac.uk/ - * - * The contents of this file are subject to the Mozilla Public License - * Version 1.1 (the "License"); you may not use this file except in - * compliance with the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. 
- * - * The Original Code is UTFBlockLexiconBuilder.java. - * - * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. - * All Rights Reserved. - * - * Contributor(s): - * Douglas Johnson (original author) - * Vassilis Plachouras - * Craig Macdonald - */ -package uk.ac.gla.terrier.structures.indexing; -import java.io.IOException; - -import org.apache.log4j.Logger; - -import uk.ac.gla.terrier.structures.Index; -import uk.ac.gla.terrier.structures.LexiconInputStream; -import uk.ac.gla.terrier.structures.UTFBlockLexiconInputStream; -import uk.ac.gla.terrier.structures.UTFBlockLexiconOutputStream; -import uk.ac.gla.terrier.structures.UTFLexicon; -import uk.ac.gla.terrier.utility.ApplicationSetup; -/** - * Builds a block lexicon using block frequencies. - * @author Douglas Johnsonm, Vassilis Plachouras & Craig Macdonald - * @version $Revision: 1.17 $ - */ -public class UTFBlockLexiconBuilder extends BlockLexiconBuilder -{ - protected static Logger logger = Logger.getRootLogger(); - /** - * A default constructor of the class. The lexicon is built in the - * default path and file: ApplicationSetup.TERRIER_INDEX_PATH and - * ApplicationSetup.TERRIER_INDEX_PREFIX respectively. - */ - public UTFBlockLexiconBuilder() { - super(); - lexiconOutputStream = UTFBlockLexiconOutputStream.class; - lexiconInputStream = UTFBlockLexiconInputStream.class; - LexiconMapClass = BlockLexiconMap.class; - try{ TempLex = (LexiconMap) LexiconMapClass.newInstance(); } catch (Exception e) {logger.error(e);} - } - - public UTFBlockLexiconBuilder(Index i) - { - super(i); - lexiconOutputStream = UTFBlockLexiconOutputStream.class; - lexiconInputStream = UTFBlockLexiconInputStream.class; - LexiconMapClass = BlockLexiconMap.class; - try{ TempLex = (LexiconMap) LexiconMapClass.newInstance(); } catch (Exception e) {logger.error(e);} - } - - /** - * A default constructor which is given a pathname in which - * the temporary lexicons will be stored. - * @param pathname String the name of the path in which the temporary - * and final lexicons will be stored. - * @param prefix String the file component of the lexicons - */ - public UTFBlockLexiconBuilder(String pathname, String prefix) { - super(pathname, prefix); - lexiconOutputStream = UTFBlockLexiconOutputStream.class; - lexiconInputStream = UTFBlockLexiconInputStream.class; - LexiconMapClass = BlockLexiconMap.class; - try{ TempLex = (LexiconMap) LexiconMapClass.newInstance(); } catch (Exception e) {logger.error(e);} - } - - /** - * The method that performs processing of the lexicon after the - * creation of the direct index has been completed. It flushes to - * disk the current temporary lexicon, and it starts the merging - * of the temporary lexicons and the creation of the lexicon index. 
- */ - public void finishedDirectIndexBuild() - { - if(logger.isInfoEnabled()){ - logger.info("flushing utf block lexicon to disk after the direct index completed"); - } - //only write a temporary lexicon if there are any items in it - if (TempLex.getNumberOfNodes() > 0) - writeTemporaryLexicon(); - - //merges the temporary lexicons - if (tempLexFiles.size() > 0) - { - try{ - merge(tempLexFiles); - - //creates the offsets file - final String lexiconFilename = - indexPath + ApplicationSetup.FILE_SEPARATOR + - indexPrefix + ApplicationSetup.LEXICONSUFFIX; - LexiconInputStream lis = getLexInputStream(lexiconFilename); - createLexiconIndex( - lis, - lis.numberOfEntries(), - /* after inverted index is built, the lexicon will be transformed into a - * normal lexicon, without block frequency */ - UTFLexicon.lexiconEntryLength - ); - TermCount = lis.numberOfEntries(); - if (index != null) - { - index.addIndexStructure("lexicon", "uk.ac.gla.terrier.structures.UTFBlockLexicon"); - index.addIndexStructureInputStream("lexicon", "uk.ac.gla.terrier.structures.UTFBlockLexiconInputStream"); - index.setIndexProperty("num.Terms", ""+lis.numberOfEntries()); - index.setIndexProperty("num.Pointers", ""+lis.getNumberOfPointersRead()); - } - } catch(IOException ioe){ - logger.error("Indexing failed to write a lexicon index file to disk", ioe); - } - } - else - logger.warn("No temporary lexicons to merge, skipping"); - - } - - -} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/UTFInvertedIndexBuilder.java src/uk/ac/gla/terrier/structures/indexing/UTFInvertedIndexBuilder.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/UTFInvertedIndexBuilder.java 2009-01-28 20:16:58.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/indexing/UTFInvertedIndexBuilder.java 1970-01-01 01:00:00.000000000 +0100 @@ -1,77 +0,0 @@ -/* - * Terrier - Terabyte Retriever - * Webpage: http://ir.dcs.gla.ac.uk/terrier - * Contact: terrier{a.}dcs.gla.ac.uk - * University of Glasgow - Department of Computing Science - * http://www.gla.ac.uk/ - * - * The contents of this file are subject to the Mozilla Public License - * Version 1.1 (the "License"); you may not use this file except in - * compliance with the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is UTFInvertedIndexBuilder.java. - * - * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. - * All Rights Reserved. - * - * Contributor(s): - * Craig Macdonald (original author) - * Vassilis Plachouras - */ -package uk.ac.gla.terrier.structures.indexing; - -import uk.ac.gla.terrier.structures.Index; -import uk.ac.gla.terrier.structures.UTFLexiconInputStream; -import uk.ac.gla.terrier.structures.UTFLexiconOutputStream; - -/** - * Builds a UTF inverted index, using field information optionally. 
- * @author Craig Macdonald & Vassilis Plachouras - * @version $Revision: 1.14 $ - */ -public class UTFInvertedIndexBuilder extends InvertedIndexBuilder { - - public UTFInvertedIndexBuilder(Index i) - { - super(i); - lexiconInputStream = UTFLexiconInputStream.class; - lexiconOutputStream = UTFLexiconOutputStream.class; - } - - /** - * A default constructor of the class InvertedIndex. - * @deprecated - */ - public UTFInvertedIndexBuilder() { - super(); - lexiconInputStream = UTFLexiconInputStream.class; - lexiconOutputStream = UTFLexiconOutputStream.class; - } - - /** @deprecated */ - public UTFInvertedIndexBuilder(String path, String prefix) - { - super(path, prefix); - lexiconInputStream = UTFLexiconInputStream.class; - lexiconOutputStream = UTFLexiconOutputStream.class; - } - - /** - * Creates an instance of the InvertedIndex - * class using the given filename. - * @param filename The name of the inverted file - * @deprecated - */ - public UTFInvertedIndexBuilder(String filename) { - super(filename); - lexiconInputStream = UTFLexiconInputStream.class; - lexiconOutputStream = UTFLexiconOutputStream.class; - } - -} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/UTFLexiconBuilder.java src/uk/ac/gla/terrier/structures/indexing/UTFLexiconBuilder.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/UTFLexiconBuilder.java 2009-01-28 20:16:58.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/indexing/UTFLexiconBuilder.java 1970-01-01 01:00:00.000000000 +0100 @@ -1,136 +0,0 @@ -/* - * Terrier - Terabyte Retriever - * Webpage: http://ir.dcs.gla.ac.uk/terrier - * Contact: terrier{a.}dcs.gla.ac.uk - * University of Glasgow - Department of Computing Science - * http://www.gla.ac.uk/ - * - * The contents of this file are subject to the Mozilla Public License - * Version 1.1 (the "License"); you may not use this file except in - * compliance with the License. You may obtain a copy of the License at - * http://www.mozilla.org/MPL/ - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. - * - * The Original Code is UTFLexiconBuilder.java. - * - * The Original Code is Copyright (C) 2004-2009 the University of Glasgow. - * All Rights Reserved. - * - * Contributor(s): - * Craig Macdonald (original author) - * Vassilis Plachouras - */ -package uk.ac.gla.terrier.structures.indexing; - -import java.io.IOException; - -import org.apache.log4j.Logger; - -import uk.ac.gla.terrier.structures.Index; -import uk.ac.gla.terrier.structures.LexiconInputStream; -import uk.ac.gla.terrier.structures.UTFLexicon; -import uk.ac.gla.terrier.structures.UTFLexiconInputStream; -import uk.ac.gla.terrier.structures.UTFLexiconOutputStream; -import uk.ac.gla.terrier.utility.ApplicationSetup; -/** - * Builds temporary lexicons during indexing a collection and - * merges them when the indexing of a collection has finished. - * @author Craig Macdonald & Vassilis Plachouras - * @version $Revision: 1.16 $ - */ -public class UTFLexiconBuilder extends LexiconBuilder -{ - private static Logger logger = Logger.getRootLogger(); - /** - * A default constructor of the class. The lexicon is built in the - * default path and file: ApplicationSetup.TERRIER_INDEX_PATH and - * ApplicationSetup.TERRIER_INDEX_PREFIX respectively. 
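The UTF-specific builder subclasses deleted here and below are redundant under the MapFile lexicon: term keys are written through the key factory registered on the index, a fixed-size Text factory sized by max.term.length, which handles ASCII and UTF terms uniformly. A sketch of that registration, as performed by the LexiconBuilder constructor earlier in this patch (the structure name "lexicon" is illustrative):

import uk.ac.gla.terrier.structures.Index;

public class RegisterKeyFactory
{
	static void register(Index index)
	{
		// one key factory serves all term encodings
		index.addIndexStructure(
			"lexicon-keyfactory",
			"uk.ac.gla.terrier.structures.seralization.FixedSizeTextFactory",
			"java.lang.String",
			"${max.term.length}");
	}
}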
- * @deprecated - */ - public UTFLexiconBuilder() { - super(); - lexiconOutputStream = UTFLexiconOutputStream.class; - lexiconInputStream = UTFLexiconInputStream.class; - } - - /** - * Creates an instance of the class, given the path - * to save the temporary lexicons. - * @param pathname String the path to save the temporary and final lexicons. - * @param prefix String the filename component of the lexicons - */ - public UTFLexiconBuilder(String pathname, String prefix) { - super(pathname, prefix); - lexiconOutputStream = UTFLexiconOutputStream.class; - lexiconInputStream = UTFLexiconInputStream.class; - } - - public UTFLexiconBuilder(Index i) - { - super(i); - lexiconOutputStream = UTFLexiconOutputStream.class; - lexiconInputStream = UTFLexiconInputStream.class; - } - - - /** - * Processing the lexicon after finished creating the - * direct and document indexes. - */ - public void finishedDirectIndexBuild() - { - if(logger.isInfoEnabled()){ - logger.info("flushing lexicon to disk after the direct index completed"); - } - //only write a temporary lexicon if there are any items in it - if (TempLex.getNumberOfNodes() > 0) - writeTemporaryLexicon(); - TempLex = null; - - //merges the temporary lexicons - if (tempLexFiles.size() > 0) - { - try{ - merge(tempLexFiles); - - //creates the offsets file - final String lexiconFilename = - indexPath + ApplicationSetup.FILE_SEPARATOR + - indexPrefix + ApplicationSetup.LEXICONSUFFIX; - LexiconInputStream lis = getLexInputStream(lexiconFilename); - createLexiconIndex( - lis, - lis.numberOfEntries(), - UTFLexicon.lexiconEntryLength - ); - TermCount = lis.numberOfEntries(); - if (index != null) - { - index.addIndexStructure("lexicon", "uk.ac.gla.terrier.structures.UTFLexicon"); - index.addIndexStructureInputStream("lexicon", "uk.ac.gla.terrier.structures.UTFLexiconInputStream"); - index.setIndexProperty("num.Terms", ""+lis.numberOfEntries()); - index.setIndexProperty("num.Pointers", ""+lis.getNumberOfPointersRead()); - } - - } catch (IOException ioe) { - logger.error("Indexing failed to write a lexicon index file to disk", ioe); - } - } - else - logger.warn("No temporary lexicons to merge, skipping"); - } - - @Override - public void finishedInvertedIndexBuild() { - super.finishedInvertedIndexBuild(); - if (index != null) - { - index.addIndexStructure("lexicon", "uk.ac.gla.terrier.structures.UTFLexicon"); - } - } - -} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/singlepass/RunsMerger.java src/uk/ac/gla/terrier/structures/indexing/singlepass/RunsMerger.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/singlepass/RunsMerger.java 2009-01-28 20:16:59.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/indexing/singlepass/RunsMerger.java 2009-03-03 14:34:49.000000000 +0000 @@ -35,6 +35,9 @@ import uk.ac.gla.terrier.compression.BitOut; import uk.ac.gla.terrier.compression.BitOutputStream; +import uk.ac.gla.terrier.structures.BasicLexiconEntry; +import uk.ac.gla.terrier.structures.BitFilePosition; +import uk.ac.gla.terrier.structures.FilePosition; import uk.ac.gla.terrier.structures.LexiconOutputStream; /** @@ -86,6 +89,8 @@ protected int currentTerm = 0; /** Number of pointers written */ protected int numberOfPointers = 0; + + protected BitFilePosition startOffset = new FilePosition(0l,(byte)0); protected RunIteratorFactory runsSource; @@ -134,14 +139,16 @@ * @return the byte offset in the BitOut (used for lexicon 
writing) */ - public int getBitOffset(){ - return bos.getBitOffset() == 0? 7: bos.getBitOffset() - 1; + public byte getBitOffset(){ + return bos.getBitOffset(); + //return bos.getBitOffset() == 0 ? (byte)7 : bos.getBitOffset() - (byte)1; } /** @@ -207,21 +214,26 @@ * @param lexStream LexiconOutputStream used to write the lexicon. * @throws IOException if an I/O error occurs. */ - public void mergeOne(LexiconOutputStream lexStream) throws Exception{ + public void mergeOne(LexiconOutputStream<String> lexStream) throws Exception{ myRun = queue.poll(); if(myRun.current().getTerm().equals(lastTermWritten)){ // append the term --> keep the data in memory lastDocument = myRun.current().append(bos, lastDocument); lastFreq += myRun.current().getTF(); lastDocFreq += myRun.current().getDf(); + }else{ - lexStream.writeNextEntry(lastTermWritten, currentTerm++, lastDocFreq, lastFreq, this.getByteOffset(), (byte)this.getBitOffset()); - // write the new term + //write this term to the lexicon + lexStream.writeNextEntry(lastTermWritten, new BasicLexiconEntry(currentTerm++, lastDocFreq, lastFreq, startOffset)); + //record the start offset of the next term + startOffset.setPosition(this.getByteOffset(), this.getBitOffset()); + //get the information of the next term from the Run numberOfPointers += lastDocFreq; lastDocument = myRun.current().append(bos,-1); lastFreq = myRun.current().getTF(); lastDocFreq = myRun.current().getDf(); lastTermWritten = myRun.current().getTerm(); + } if(myRun.hasNext()){ myRun.next(); @@ -236,8 +248,9 @@ * @param lexStream LexiconOutputStream used to write the lexicon. * @throws IOException if an I/O error occurs.
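Note the change of convention above: a lexicon entry now records where its postings begin, so mergeOne captures the offset before appending and only advances it after an entry is written; endMerge, below, flushes the final pending term the same way. A minimal sketch of the bookkeeping (offset values illustrative):

import uk.ac.gla.terrier.structures.BasicLexiconEntry;
import uk.ac.gla.terrier.structures.BitFilePosition;
import uk.ac.gla.terrier.structures.FilePosition;

public class OffsetBookkeeping
{
	public static void main(String[] args)
	{
		BitFilePosition start = new FilePosition(0l, (byte)0);
		// ... postings for the term are appended to the BitOut here ...
		// the entry records the start of the term's postings, not the end
		BasicLexiconEntry le = new BasicLexiconEntry(0, 1, 1, start);
		System.out.println(le);
		// advance to where the next term's postings will begin
		start.setPosition(42l, (byte)3);
	}
}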
*/ - public void endMerge(LexiconOutputStream lexStream) throws IOException{ - lexStream.writeNextEntry(lastTermWritten, currentTerm++, lastDocFreq, lastFreq, this.getByteOffset(), (byte)this.getBitOffset()); + public void endMerge(LexiconOutputStream lexStream) throws IOException{ + lexStream.writeNextEntry(lastTermWritten, new BasicLexiconEntry(currentTerm++, lastDocFreq, lastFreq, startOffset)); + //startOffset.setPosition(this.getByteOffset(), this.getBitOffset()); numberOfPointers += lastDocFreq; bos.close(); myRun.close(); diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/singlepass/hadoop/HadoopRunsMerger.java src/uk/ac/gla/terrier/structures/indexing/singlepass/hadoop/HadoopRunsMerger.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/indexing/singlepass/hadoop/HadoopRunsMerger.java 2009-02-16 21:43:03.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/indexing/singlepass/hadoop/HadoopRunsMerger.java 2009-03-03 14:34:49.000000000 +0000 @@ -30,6 +30,7 @@ import java.io.IOException; import java.util.LinkedList; import java.util.ListIterator; +import uk.ac.gla.terrier.structures.BasicLexiconEntry; import org.apache.hadoop.mapred.TaskID; import uk.ac.gla.terrier.structures.LexiconOutputStream; import uk.ac.gla.terrier.structures.indexing.singlepass.PostingInRun; @@ -62,9 +63,9 @@ mapData = _mapData; } - public void endMerge(LexiconOutputStream lexStream) {} + public void endMerge(LexiconOutputStream lexStream) {} - public void mergeOne(LexiconOutputStream lexStream) throws Exception + public void mergeOne(LexiconOutputStream lexStream) throws Exception { int maxDF = 0; RunIterator run = runsSource.createRunIterator(-1); @@ -74,6 +75,8 @@ lastTermWritten = null; lastFreq = 0; lastDocFreq= 0; + long startOffset = this.getByteOffset(); + byte startBitOffset = this.getBitOffset(); // for each run in the list int counter = 0; //for one term: for each set of postings for that term @@ -83,7 +86,6 @@ PostingInRun posting = run.next(); lastTermWritten = posting.getTerm(); final int reduceNumber = (TaskID.forName(_run.getMapNo()).getId()/partitionSize); - // if (posting.getDf() > maxDF) maxDF = posting.getDf(); @@ -129,7 +131,7 @@ lastDocFreq += posting.getDf(); counter++; } - lexStream.writeNextEntry(lastTermWritten, currentTerm++, lastDocFreq, lastFreq, this.getByteOffset(), (byte)this.getBitOffset()); + lexStream.writeNextEntry(lastTermWritten, new BasicLexiconEntry(currentTerm++, lastDocFreq, lastFreq, startOffset, startBitOffset)); numberOfPointers += lastDocFreq; } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/maps/MapFile.java src/uk/ac/gla/terrier/structures/maps/MapFile.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/maps/MapFile.java 1970-01-01 01:00:00.000000000 +0100 +++ src/uk/ac/gla/terrier/structures/maps/MapFile.java 2009-03-03 14:34:49.000000000 +0000 @@ -0,0 +1,672 @@ +package uk.ac.gla.terrier.structures.maps; + +import java.io.Closeable; +import java.io.DataInput; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.AbstractCollection; +import java.util.AbstractSet; +import java.util.Collection; +import java.util.Iterator; +import java.util.Map; +import java.util.NoSuchElementException; +import java.util.Set; + +import org.apache.hadoop.io.Writable; +import 
org.apache.hadoop.io.WritableComparable; +import org.apache.log4j.Logger; + +import uk.ac.gla.terrier.structures.seralization.FixedSizeWriteableFactory; +import uk.ac.gla.terrier.structures.seralization.WriteableFactory; +import uk.ac.gla.terrier.utility.Files; +import uk.ac.gla.terrier.utility.io.RandomDataInput; +import uk.ac.gla.terrier.utility.io.RandomDataOutput; + + +public class MapFile< + K extends WritableComparable, + V extends Writable + > + implements OrderedMap, Closeable +{ + /** The logger used for this class */ + protected static final Logger logger = Logger.getLogger(MapFile.class); + + public interface MapFileBSearchShortcut + { + public int[] searchBounds(KEY key) throws IOException; + } + + interface OrderedMapEntry extends Entry + { + public int getIndex(); + } + + + + class DefaultMapFileBSearchShortcut implements MapFileBSearchShortcut + { + final int[] defaultBounds = new int[]{0,numberOfEntries}; + public int[] searchBounds(KEY key) + { + return defaultBounds; + } + } + + /** an iterator for entries. */ + public static class EntryIterator + implements Iterator>, java.io.Closeable + { + protected DataInput di; + protected int numEntries; + protected int counter = 0; + protected WriteableFactory keyFactory; + protected WriteableFactory valueFactory; + + public EntryIterator(String filename, FixedSizeWriteableFactory _keyFactory, FixedSizeWriteableFactory _valueFactory) + throws IOException + { + this( + new DataInputStream(Files.openFileStream(filename)), + (int)(Files.length(filename)/( _keyFactory.getSize() + _valueFactory.getSize() )), + _keyFactory, + _valueFactory + ); + } + + EntryIterator(DataInput _di, int _numEntries, WriteableFactory _keyFactory, WriteableFactory _valueFactory) + { + di = _di; + numEntries = _numEntries; + this.keyFactory = _keyFactory; + this.valueFactory = _valueFactory; + } + + public void close() throws IOException + { + ((Closeable)di).close(); + } + + public boolean hasNext() + { + //System.err.println(this.toString()+"check:"+(counter < numEntries)+" counter="+counter + " numEntries="+numEntries); + //new Exception().printStackTrace(); + return counter < numEntries; + } + + public Entry next() + { + //System.err.println(this.toString()+"counter="+counter + " numEntries="+numEntries); + if (counter >= numEntries) + { + //System.err.println(this.toString()+"ERROR counter="+counter + " numEntries="+numEntries); + throw new NoSuchElementException(); + } + IK key = keyFactory.newInstance(); + IV value = valueFactory.newInstance(); + try{ + key.readFields(di); + value.readFields(di); + counter++; + } catch (IOException ioe) { + logger.error("IOException while iterating", ioe); + throw new NoSuchElementException("IOException while iterating"); + } + if ((counter == numEntries) && di instanceof Closeable) + try{ + ((Closeable)di).close(); + } catch (IOException ioe) {} + return new MapFileEntry(key,value,counter-1); + } + + public void remove() { throw new UnsupportedOperationException();} + } + + /** an iterator for entries. 
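EntryIterator, above, also backs the Map views of the surrounding class; a hedged sketch of scanning a whole MapFile in key order (assumes Text keys from FixedSizeTextFactory and any fixed-size value factory; close() comes from the java.io.Closeable the class implements):

import java.io.IOException;
import java.util.Map;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import uk.ac.gla.terrier.structures.maps.MapFile;
import uk.ac.gla.terrier.structures.seralization.FixedSizeWriteableFactory;

public class ScanMapFile
{
	static <V extends Writable> void scan(String filename,
			FixedSizeWriteableFactory<Text> keys,
			FixedSizeWriteableFactory<V> values) throws IOException
	{
		// open read-only (updateable == false)
		MapFile<Text,V> map = new MapFile<Text,V>(filename, false, keys, values);
		for (Map.Entry<Text,V> e : map.entrySet())
			System.out.println(e.getKey() + " -> " + e.getValue());
		map.close();
	}
}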
*/ + class valueIterator implements Iterator + { + DataInput di; + int numEntries; + int count = 0; + K uselessKey; + + valueIterator(DataInput _di, int _numEntries) + { + di = _di; + numEntries = _numEntries; + uselessKey = keyFactory.newInstance(); + } + + public boolean hasNext() + { + return count < numEntries; + } + + public V next() + { + if (count++ >= numEntries) + throw new NoSuchElementException(); + V value = valueFactory.newInstance(); + try{ + uselessKey.readFields(di); + value.readFields(di); + } catch (IOException ioe) { + logger.error("IOException while iterating", ioe); + throw new NoSuchElementException("IOException while iterating"); + } + if ((count == numEntries) && di instanceof Closeable) + try{ + ((Closeable)di).close(); + } catch (IOException ioe) {} + return value; + } + + public void remove() { throw new UnsupportedOperationException();} + } + + /** an iterator for entries. */ + class keyIterator implements Iterator, Closeable + { + DataInput di; + int numEntries; + int count = 0; + V uselessValue; + + keyIterator(DataInput _di, int _numEntries) + { + di = _di; + numEntries = _numEntries; + uselessValue = valueFactory.newInstance(); + } + + public boolean hasNext() + { + return count < numEntries; + } + + public K next() + { + if (count++ >= numEntries) + throw new NoSuchElementException(); + K key = keyFactory.newInstance(); + try{ + key.readFields(di); + uselessValue.readFields(di); + } catch (IOException ioe) { + logger.error("IOException while iterating", ioe); + throw new NoSuchElementException("IOException while iterating"); + } + if ((count == numEntries) && di instanceof Closeable) + try{ + ((Closeable)di).close(); + } catch (IOException ioe) {} + return key; + } + + public void remove() { throw new UnsupportedOperationException();} + public void close() + { + if (di instanceof Closeable) + try{ + ((Closeable)di).close(); + } catch (IOException ioe) {} + } + } + + + class MapFileEntrySet extends AbstractSet> + { + public boolean add(Map.Entry e) + { + put(e.getKey(), e.getValue()); + return true; + } + + public int size() + { + return numberOfEntries; + } + + public boolean isEmpty() + { + return size() == 0; + } + + public Iterator> iterator() + { + try{ + return new EntryIterator( + new DataInputStream(Files.openFileStream(dataFilename)), + numberOfEntries, + keyFactory, + valueFactory + ); + } catch (IOException ioe) { + return null; + } + } + + @SuppressWarnings("unchecked") + public boolean contains(Object o) + { + K key = (K)o; + if (get(key) == null) + return false; + return true; + } + + public boolean remove(Map.Entry e) + { + remove(e.getKey()); + return true; + } + + public void clear() + { + _clear(); + } + } + + class MapFileKeySet extends AbstractSet + { + public int size() + { + return numberOfEntries; + } + + public boolean isEmpty() + { + return size() == 0; + } + + public Iterator iterator() + { + try{ + return new keyIterator( + new DataInputStream(Files.openFileStream(dataFilename)), + numberOfEntries + ); + } catch (IOException ioe) { + return null; + } + } + + @SuppressWarnings("unchecked") + public boolean contains(Object o) + { + K key = (K)o; + if (get(key) == null) + return false; + return true; + } + } + + + + static class MapFileEntry implements OrderedMapEntry + { + EK key; + EV value; + int index; + MapFileEntry(EK _key, EV _value, int _index) + { + this.key = _key; + this.value = _value; + this.index = _index; + } + + public EK getKey() + { + return key; + } + + public int getIndex() + { + return index; + } + + public EV 
getValue() + { + return value; + } + + public EV setValue(EV value) + { + + //TODO why does this cause exception? + //put(this.key, value); + return null; + } + + public String toString() + { + return "Entry<"+key.toString() + ","+value.toString()+">"; + } + + @SuppressWarnings("unchecked") + public boolean equals(Object o) + { + Map.Entry e1 = this; + Map.Entry e2 = (Map.Entry)o; + return (e1.getKey()==null ? + e2.getKey()==null : e1.getKey().equals(e2.getKey())) && + (e1.getValue()==null ? + e2.getValue()==null : e1.getValue().equals(e2.getValue())); + } + + public int hashCode() + { + return + (getKey()==null ? 0 : getKey().hashCode()) ^ + (getValue()==null ? 0 : getValue().hashCode()); + } + } + + class MapFileValueCollection + extends AbstractCollection + implements Collection + { + public int size() + { + return numberOfEntries; + } + + public Iterator iterator() + { + try{ + return new valueIterator( + new DataInputStream(Files.openFileStream(dataFilename)), + numberOfEntries); + } catch (IOException ioe) { + logger.error("Problem reading MapFile "+dataFilename+" as stream", ioe); + return null; + } + } + } + + /** actual underlying data file */ + protected RandomDataInput dataFile = null; + /** filename of the underlying file */ + protected String dataFilename; + + /** The number of entries in the file.*/ + protected int numberOfEntries; + /** total size of one key,value pair */ + protected int entrySize; + + protected MapFileBSearchShortcut shortcut = new DefaultMapFileBSearchShortcut(); + + protected FixedSizeWriteableFactory keyFactory; + protected FixedSizeWriteableFactory valueFactory; + + protected RandomDataOutput write() + { + if (! (dataFile instanceof RandomDataOutput)) + throw new UnsupportedOperationException(); + return (RandomDataOutput)dataFile; + } + + public static int numberOfEntries( + String filename, + FixedSizeWriteableFactory _keyFactory, + FixedSizeWriteableFactory _valueFactory) + { + long length = Files.length(filename); + long entrySize = _keyFactory.getSize() + _valueFactory.getSize(); + return (int)(length/entrySize); + } + + public MapFile( + String filename, + boolean updateable, + FixedSizeWriteableFactory _keyFactory, + FixedSizeWriteableFactory _valueFactory) + throws IOException + { + this.dataFile = updateable + ? 
Files.writeFileRandom(this.dataFilename = filename)
+			: Files.openFileRandom(this.dataFilename = filename);
+		this.keyFactory = _keyFactory;
+		this.valueFactory = _valueFactory;
+		this.entrySize = _keyFactory.getSize() + _valueFactory.getSize();
+		//System.err.println("MapFile entrySize is "+ this.entrySize);
+		this.numberOfEntries = (int) (dataFile.length() / (long)entrySize);
+	}
+
+	public WriteableFactory<K> getKeyFactory() {
+		return this.keyFactory;
+	}
+
+	public WriteableFactory<V> getValueFactory() {
+		return this.valueFactory;
+	}
+
+	public void clear()
+	{
+		_clear();
+	}
+
+
+	//renamed so that inner classes can access
+	protected void _clear()
+	{
+		RandomDataOutput _dataFile = write();
+		try{
+			_dataFile.setLength(0);
+			numberOfEntries = 0;
+		} catch (IOException ioe) {
+			logger.warn("Could not clear MapFile", ioe);
+		}
+	}
+
+	public Set<Map.Entry<K,V>> entrySet()
+	{
+		return new MapFileEntrySet();
+	}
+
+	public Set<K> keySet()
+	{
+		return new MapFileKeySet();
+	}
+
+	public Collection<V> values()
+	{
+		return new MapFileValueCollection();
+	}
+
+	public int size()
+	{
+		return numberOfEntries;
+	}
+
+	public boolean containsValue(Object o)
+	{
+		throw new UnsupportedOperationException();
+	}
+
+	@SuppressWarnings("unchecked")
+	public boolean containsKey(Object o)
+	{
+		return getEntry((K)o) != null;
+	}
+
+	public boolean isEmpty()
+	{
+		return numberOfEntries == 0;
+	}
+
+	public V put(K key, V value)
+	{
+		//RandomDataOutput _dataFile = write();
+		throw new UnsupportedOperationException();
+	}
+
+	public V remove(Object _key)
+	{
+		//K key = (K)_key;
+		//RandomDataOutput _dataFile = write();
+		throw new UnsupportedOperationException();
+	}
+
+	public void setBSearchShortcut(MapFileBSearchShortcut<K> _shortcut)
+	{
+		this.shortcut = _shortcut;
+	}
+
+	/** this method is the one which does the actual disk lookup of entries */
+	protected Entry<K,V> getEntry(K key)
+	{
+		int[] bounds;
+		try{
+			bounds = shortcut.searchBounds(key);
+		} catch (IOException ioe) {
+			bounds = new int[]{0, numberOfEntries};
+		}
+		//bounds are {firstEntry, lastEntry+1}: search [low,high] inclusively,
+		//so that the very first entry in the file is also reachable
+		int low = bounds[0];
+		int high = bounds[1] - 1;
+
+		int i;
+		int compareEntry;
+
+		K testKey = keyFactory.newInstance();
+		V value = valueFactory.newInstance();
+
+		try{
+			while (low <= high)
+			{
+				i = (low + high) >>> 1;
+				dataFile.seek((long)i * entrySize);
+				testKey.readFields(dataFile);
+				//System.err.println("Checking "+testKey.toString());
+				if ((compareEntry = key.compareTo(testKey)) < 0)
+					high = i - 1;
+				else if (compareEntry > 0)
+					low = i + 1;
+				else
+				{
+					//read the rest and return the data
+					value.readFields(dataFile);
+					return new MapFileEntry<K,V>(testKey, value, i);
+				}
+			}
+		} catch (IOException ioe) {
+			logger.error("IOException reading MapFile", ioe);
+		}
+		return null;
+	}
+
+	@SuppressWarnings("unchecked")
+	public V get(Object _key)
+	{
+		K key = (K)_key;
+		Map.Entry<K,V> entry = getEntry(key);
+		if (entry == null)
+			return null;
+		//System.err.println(key.toString() + "=" + entry.getValue().toString());
+		return entry.getValue();
+	}
+
+	public Entry<K,V> get(int entryNumber)
+	{
+		K key = keyFactory.newInstance();
+		V value = valueFactory.newInstance();
+		if (entryNumber >= numberOfEntries)
+			throw new
NoSuchElementException(); + + try{ + dataFile.seek((long)entryNumber * entrySize); + key.readFields(dataFile); + value.readFields(dataFile); + } catch (IOException ioe) { + throw new NoSuchElementException( + "IOException reading MapFile: "+ioe); + } + return new MapFileEntry(key, value, entryNumber); + } + + public void putAll(Map m) + { + for (Map.Entry e : m.entrySet()) + put(e.getKey(), e.getValue()); + } + + public void close() throws IOException + { + dataFile.close(); + } + + /** writes an entire map MapFile at once, to the specified filename, + * and using the data contained in the specified iterator + */ + public static void mapFileWrite(String filename, + Iterable> t) throws IOException + { + mapFileWrite(filename, t.iterator()); + } + + public static void mapFileWrite(String filename, + Iterator> ti) + throws IOException + { + DataOutputStream out = new DataOutputStream(Files.writeFileStream(filename)); + while (ti.hasNext()) + { + Entry e = ti.next(); + e.getKey().write(out); + e.getValue().write(out); + } + out.close(); + } + + /** returns a utility class which can be used to write a MapFile */ + public static MapFileWriter mapFileWrite(final String filename) + throws IOException + { + return new MapFileWriter(){ + DataOutputStream out = new DataOutputStream(Files.writeFileStream(filename)); + public void write(WritableComparable key, Writable value) + throws IOException + { + //System.err.println("writing key "+ key.toString()); + key.write(out); + //System.err.println("writing value "+ value.toString()); + value.write(out); + } + + public void close() throws IOException + { + out.close(); + } + }; + } + + public interface MapFileWriter extends Closeable + { + public void write(WritableComparable key, Writable value) + throws IOException; + } +} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/maps/OrderedMap.java src/uk/ac/gla/terrier/structures/maps/OrderedMap.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/maps/OrderedMap.java 1970-01-01 01:00:00.000000000 +0100 +++ src/uk/ac/gla/terrier/structures/maps/OrderedMap.java 2009-03-03 14:34:49.000000000 +0000 @@ -0,0 +1,8 @@ +package uk.ac.gla.terrier.structures.maps; +import java.util.Map; + +public interface OrderedMap extends Map +{ + /** Return the entry at the specified index */ + public Map.Entry get(int index); +} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/merging/BlockStructureMerger.java src/uk/ac/gla/terrier/structures/merging/BlockStructureMerger.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/merging/BlockStructureMerger.java 2009-01-28 20:17:00.000000000 +0000 +++ src/uk/ac/gla/terrier/structures/merging/BlockStructureMerger.java 2009-03-03 14:34:49.000000000 +0000 @@ -26,8 +26,8 @@ package uk.ac.gla.terrier.structures.merging; import java.io.IOException; import java.util.Date; + import uk.ac.gla.terrier.compression.BitOut; -import uk.ac.gla.terrier.compression.BitOutputStream; import uk.ac.gla.terrier.sorting.SortAscendingQuadrupleVectors; import uk.ac.gla.terrier.sorting.SortAscendingQuintupleVectors; import uk.ac.gla.terrier.structures.BlockDirectInvertedOutputStream; @@ -53,22 +53,7 @@ */ public class BlockStructureMerger extends StructureMerger { - /** - * A constructor that sets the filenames of the inverted - * files to merge - * @param _filename1 the first 
inverted file to merge - * @param _filename2 the second inverted file to merge - * @deprecated - */ - public BlockStructureMerger(String _filename1, String _filename2) { - super(_filename1, _filename2); - directFileOutputStreamClass = BlockDirectInvertedOutputStream.class; - directFileInputClass = "uk.ac.gla.terrier.structures.BlockDirectIndex"; - directFileInputStreamClass = "uk.ac.gla.terrier.structures.BlockDirectIndexInputStream"; - invertedFileOutputStreamClass = BlockDirectInvertedOutputStream.class; - invertedFileInputClass = "uk.ac.gla.terrier.structures.BlockInvertedIndex"; - invertedFileInputStreamClass = "uk.ac.gla.terrier.structures.BlockInvertedIndexInputStream"; - } + public BlockStructureMerger(Index _srcIndex1, Index _srcIndex2, Index _destIndex) { @@ -81,15 +66,7 @@ invertedFileInputStreamClass = "uk.ac.gla.terrier.structures.BlockInvertedIndexInputStream"; } - /** write Block postings. - * @deprecated Use BlockDirectInvertedOutputStream instead */ - public static void writeBlockPostings(int[][] postings, int firstId, BitOutputStream output, int binaryBits) - throws IOException { - if (binaryBits>0) - writeFieldPostings(postings, firstId, output, binaryBits); - else - writeNoFieldPostings(postings, firstId, output); - } + /** @@ -268,7 +245,8 @@ long start = System.currentTimeMillis(); logger.info("started at " + (new Date())); if (ApplicationSetup.getProperty("merger.onlylexicons","false").equals("true")) { - sMerger.mergeLexicons(); + System.err.println("Use LexiconMerger"); + return; } else if (ApplicationSetup.getProperty("merger.onlydocids","false").equals("true")) { sMerger.mergeDocumentIndexFiles(); } else { @@ -279,105 +257,6 @@ long end = System.currentTimeMillis(); logger.info("time elapsed: " + ((end-start)*1.0d/1000.0d) + " sec."); } - - - - - - - - - - - - - /** write Block postings with fields. - * @deprecated Use BlockDirectInvertedOutputStream instead */ - public static void writeFieldPostings(int[][] postings, int firstId, final BitOutputStream output, final int binaryBits) - throws IOException { - - //local variables in order to reduce the number - //of times we need to access a two-dimensional array - final int[] postings0 = postings[0]; - final int[] postings1 = postings[1]; - final int[] postings2 = postings[2]; - final int[] postings3 = postings[3]; - final int[] postings4 = postings[4]; - - //write the first posting from the term's postings list - output.writeGamma(firstId); //write document id - output.writeUnary(postings1[0]); //write frequency - output.writeBinary(binaryBits, postings2[0]); //write fields if binaryBits>0 - int blockIndex = 0; //the index of the current block id - int blockFrequency = postings3[0]; //the number of block ids to write - output.writeUnary(blockFrequency); //write block frequency - output.writeGamma(postings4[blockIndex]+1); //write the first block id - blockIndex++; //move to the next block id - for (int i=1; i0 - blockFrequency = postings3[k]; //number of block ids to write - output.writeUnary(blockFrequency); //write block frequency - output.writeGamma(postings4[blockIndex]+1); //write the first block id - blockIndex++; //move to the next block id - for (int i=1; i> lexInStream1 = + (Iterator>)srcIndex1.getIndexStructureInputStream("lexicon"); + Iterator> lexInStream2 = + (Iterator>)srcIndex2.getIndexStructureInputStream("lexicon"); - - //setting the output stream - LexiconOutputStream lexOutStream = UTFIndexing - ? 
new UTFLexiconOutputStream(destIndex.getPath(), destIndex.getPrefix())
-			: new LexiconOutputStream(destIndex.getPath(), destIndex.getPrefix());
-		int hasMore1 = -1;
-		int hasMore2 = -1;
+		destIndex.setIndexProperty("lexicon-keyfactory", srcIndex1.getIndexProperty("lexicon-keyfactory", null));
+		destIndex.setIndexProperty("lexicon-valuefactory", srcIndex1.getIndexProperty("lexicon-valuefactory", null));
+
+
+		//setting the output stream
+		LexiconOutputStream lexOutStream = new MapFileLexiconOutputStream(
+			destIndex.getPath(), destIndex.getPrefix(),
+			"lexicon",
+			(FixedSizeWriteableFactory)destIndex.getIndexStructure("lexicon-keyfactory"));
+
+		boolean hasMore1 = false;
+		boolean hasMore2 = false;
 		String term1;
 		String term2;
 		int termId = 0;
-		hasMore1 = lexInStream1.readNextEntry();
-		hasMore2 = lexInStream2.readNextEntry();
-		while (hasMore1 >=0 && hasMore2 >= 0) {
-			term1 = lexInStream1.getTerm();
-			term2 = lexInStream2.getTerm();
-			//System.out.println("term1 : " + term1 + "with id " + lexInStream1.getTermId());
-			//System.out.println("term2 : " + term2 + "with id " + lexInStream2.getTermId());
+		Map.Entry<String,LexiconEntry> lee1 = null;
+		Map.Entry<String,LexiconEntry> lee2 = null;
+		hasMore1 = lexInStream1.hasNext();
+		if (hasMore1)
+			lee1 = lexInStream1.next();
+		hasMore2 = lexInStream2.hasNext();
+		if (hasMore2)
+			lee2 = lexInStream2.next();
+		while (hasMore1 && hasMore2) {
+			term1 = lee1.getKey();
+			term2 = lee2.getKey();
 			int lexicographicalCompare = term1.compareTo(term2);
 			if (lexicographicalCompare < 0) {
-
-				lexOutStream.writeNextEntry(term1,
-					termId,
-					lexInStream1.getNt(),
-					lexInStream1.getTF(),
-					0L,
-					(byte)0);
+				lee1.getValue().setTermId(termId);
+				lee1.getValue().setPosition(0, (byte)0);
+				lexOutStream.writeNextEntry(term1, lee1.getValue());
 				termId++;
-				hasMore1 = lexInStream1.readNextEntry();
+				//only the stream whose entry was consumed is advanced
+				hasMore1 = lexInStream1.hasNext();
+				if (hasMore1)
+					lee1 = lexInStream1.next();
 			} else if (lexicographicalCompare > 0) {
-
-				lexOutStream.writeNextEntry(term2,
-					termId,
-					lexInStream2.getNt(),
-					lexInStream2.getTF(),
-					0L,
-					(byte)0);
+				lee2.getValue().setTermId(termId);
+				lee2.getValue().setPosition(0, (byte)0);
+				lexOutStream.writeNextEntry(term2, lee2.getValue());
 				termId++;
-				hasMore2 = lexInStream2.readNextEntry();
+				hasMore2 = lexInStream2.hasNext();
+				if (hasMore2)
+					lee2 = lexInStream2.next();
 			} else {
-				lexOutStream.writeNextEntry(term1,
-					termId,
-					(lexInStream1.getNt() + lexInStream2.getNt()),
-					(lexInStream1.getTF() + lexInStream2.getTF()),
-					0L,
-					(byte)0);
-				hasMore1 = lexInStream1.readNextEntry();
-				hasMore2 = lexInStream2.readNextEntry();
+				lee1.getValue().setTermId(termId);
+				lee1.getValue().setPosition(0, (byte)0);
+				lee1.getValue().add(lee2.getValue());
+				lexOutStream.writeNextEntry(term1, lee1.getValue());
+				hasMore1 = lexInStream1.hasNext();
+				if (hasMore1)
+					lee1 = lexInStream1.next();
+				hasMore2 = lexInStream2.hasNext();
+				if (hasMore2)
+					lee2 = lexInStream2.next();
 				termId++;
 			}
 		}
-		if (hasMore1 >= 0) {
-			while (hasMore1 >= 0) {
-				lexOutStream.writeNextEntry(lexInStream1.getTerm(),
-					termId,
-					lexInStream1.getNt(),
-					lexInStream1.getTF(),
-					0L,
-					(byte)0);
-				hasMore1 = lexInStream1.readNextEntry();
+		if (hasMore1) {
+			//drain stream 1, starting with the entry already fetched
+			while (hasMore1) {
+				lee1.getValue().setTermId(termId);
+				lee1.getValue().setPosition(0, (byte)0);
+				lexOutStream.writeNextEntry(lee1.getKey(), lee1.getValue());
+				hasMore1 = lexInStream1.hasNext();
+				if (hasMore1)
+					lee1 = lexInStream1.next();
 				termId++;
 			}
-		} else if (hasMore2 >= 0) {
-			while (hasMore2 >= 0) {
-				lexOutStream.writeNextEntry(lexInStream2.getTerm(),
-					termId,
-					lexInStream2.getNt(),
-					lexInStream2.getTF(),
-					0L,
-					(byte)0);
-				hasMore2 = lexInStream2.readNextEntry();
+		} else if (hasMore2) {
+			//drain stream 2, starting with the entry already fetched
+			while (hasMore2) {
+				lee2.getValue().setTermId(termId);
+				lee2.getValue().setPosition(0, (byte)0);
+				lexOutStream.writeNextEntry(lee2.getKey(), lee2.getValue());
+				hasMore2 = lexInStream2.hasNext();
+				if (hasMore2)
+					lee2 = lexInStream2.next();
 				termId++;
 			}
 		}
-		lexInStream1.close();
-		lexInStream2.close();
-		destIndex.setIndexProperty("num.Pointers", ""+lexOutStream.getNumberOfPointersWritten());
-		destIndex.setIndexProperty("num.Terms", ""+lexOutStream.getNumberOfTermsWritten());
-		destIndex.setIndexProperty("num.Tokens", ""+lexOutStream.getNumberOfTokensWritten());
-		destIndex.addIndexStructure("lexicon", UTFIndexing
-			? "uk.ac.gla.terrier.structures.UTFLexicon"
-			: "uk.ac.gla.terrier.structures.Lexicon");
-		destIndex.addIndexStructureInputStream("lexicon", UTFIndexing
-			? "uk.ac.gla.terrier.structures.UTFLexiconInputStream"
-			: "uk.ac.gla.terrier.structures.LexiconInputStream");
+		if (lexInStream1 instanceof Closeable) {
+			((Closeable)lexInStream1).close();
+		}
+		if (lexInStream2 instanceof Closeable) {
+			((Closeable)lexInStream2).close();
+		}
 		lexOutStream.close();
+		LexiconBuilder.optimise(destIndex, "lexicon");
 		destIndex.flush();
 	} catch(IOException ioe) {
 		logger.error("IOException while merging lexicons.", ioe);
 	}
-	// create an empty lexid file
-	//try{
-	//	BufferedWriter bw = new BufferedWriter(Files.writeFileWriter(
-	//		this.lexiconFileOutput+"id"));
-	//	bw.write(" ");
-	//	bw.close();
-	//}
-	//catch(IOException e){
-	//	e.printStackTrace();
-	//}
-	try{
-		LexiconBuilder.createLexiconIndex(destIndex);
-		if (USE_HASH)
-			LexiconBuilder.createLexiconHash(destIndex);
-	} catch (IOException ioe) {
-		logger.warn("Problems writing lexicon lexid or lexicon hash", ioe);
-	}
 }
 public static void main(String[] args) {
diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/merging/StructureMerger.java src/uk/ac/gla/terrier/structures/merging/StructureMerger.java
--- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/merging/StructureMerger.java	2009-01-28 20:17:00.000000000 +0000
+++ src/uk/ac/gla/terrier/structures/merging/StructureMerger.java	2009-03-03 14:34:49.000000000 +0000
@@ -30,25 +30,28 @@
 import java.io.IOException;
 import java.util.Date;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.apache.log4j.Logger;
-import uk.ac.gla.terrier.compression.BitOut;
 import uk.ac.gla.terrier.sorting.SortAscendingPairedVectors;
 import uk.ac.gla.terrier.sorting.SortAscendingTripleVectors;
+import uk.ac.gla.terrier.structures.Closeable;
 import uk.ac.gla.terrier.structures.DirectIndex;
 import uk.ac.gla.terrier.structures.DirectInvertedOutputStream;
 import uk.ac.gla.terrier.structures.DocumentIndexInputStream;
 import uk.ac.gla.terrier.structures.FilePosition;
 import uk.ac.gla.terrier.structures.Index;
 import uk.ac.gla.terrier.structures.InvertedIndex;
-import uk.ac.gla.terrier.structures.LexiconInputStream;
+import uk.ac.gla.terrier.structures.LexiconEntry;
 import uk.ac.gla.terrier.structures.LexiconOutputStream;
-import uk.ac.gla.terrier.structures.UTFLexiconOutputStream;
+import uk.ac.gla.terrier.structures.MapFileLexiconOutputStream;
 import uk.ac.gla.terrier.structures.indexing.DocumentIndexBuilder;
 import uk.ac.gla.terrier.structures.indexing.LexiconBuilder;
+import uk.ac.gla.terrier.structures.seralization.FixedSizeWriteableFactory;
 import uk.ac.gla.terrier.utility.ApplicationSetup;
-import org.apache.log4j.Logger;
-
 /**
 * This class merges the structures created by Terrier, so that
 * we use fewer and larger inverted and direct files.
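The rewritten mergeLexicons() above is a textbook two-way merge of two term-sorted streams: compare the two head entries, emit the lexicographically smaller one (or the combined entry on a tie), and advance only the stream(s) whose head was consumed, then drain whichever stream is left. A minimal standalone sketch of that pattern follows; TwoWayMergeSketch and Handler are illustrative names, not part of the Terrier API.

import java.util.Iterator;
import java.util.Map;

// Sketch only: shows the advance-on-demand merge pattern used by mergeLexicons().
class TwoWayMergeSketch
{
	interface Handler<V>
	{
		// v2 is null when the term occurs only in stream 1, and vice versa
		void emit(String term, V v1, V v2);
	}

	static <V> void merge(Iterator<Map.Entry<String,V>> in1,
			Iterator<Map.Entry<String,V>> in2, Handler<V> out)
	{
		Map.Entry<String,V> e1 = in1.hasNext() ? in1.next() : null;
		Map.Entry<String,V> e2 = in2.hasNext() ? in2.next() : null;
		while (e1 != null && e2 != null)
		{
			int cmp = e1.getKey().compareTo(e2.getKey());
			if (cmp < 0) {			// term only in stream 1: emit, advance stream 1
				out.emit(e1.getKey(), e1.getValue(), null);
				e1 = in1.hasNext() ? in1.next() : null;
			} else if (cmp > 0) {	// term only in stream 2
				out.emit(e2.getKey(), null, e2.getValue());
				e2 = in2.hasNext() ? in2.next() : null;
			} else {				// term in both: emit once, advance both
				out.emit(e1.getKey(), e1.getValue(), e2.getValue());
				e1 = in1.hasNext() ? in1.next() : null;
				e2 = in2.hasNext() ? in2.next() : null;
			}
		}
		for (; e1 != null; e1 = in1.hasNext() ? in1.next() : null)
			out.emit(e1.getKey(), e1.getValue(), null);
		for (; e2 != null; e2 = in2.hasNext() ? in2.next() : null)
			out.emit(e2.getKey(), null, e2.getValue());
	}
}

The invariant to preserve is that a fetched-but-unwritten head entry is never discarded: each branch advances exactly the iterators it consumed, which is what the corrected loops above now do.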
@@ -63,8 +66,6 @@
 public class StructureMerger {
 	/** use UTF supporting lexicon */
 	protected final boolean UTFIndexing = Boolean.parseBoolean(ApplicationSetup.getProperty("string.use_utf", "false"));
-	/** build a lexicon hash */
-	protected boolean USE_HASH = Boolean.parseBoolean(ApplicationSetup.getProperty("lexicon.use.hash","true"));
 	/** the logger used */
 	protected static Logger logger = Logger.getRootLogger();
@@ -128,34 +129,6 @@
 		numberOfTerms = 0;
 	}
-	protected static String[] getIndexPathPrefix(String _IFfilename)
-	{
-
-		String parts[] = _IFfilename.split(ApplicationSetup.FILE_SEPARATOR);
-		String path = _IFfilename.replaceFirst(parts[parts.length -1]+"$", "");
-		String prefix = parts[parts.length -1].replaceAll(ApplicationSetup.IFSUFFIX+"$", "");
-		return new String[]{path,prefix};
-	}
-
-	/**
-	 * A constructor that sets the filenames of the inverted
-	 * files to merge
-	 * @param _srcfilename1 the first inverted file to merge
-	 * @param _srcfilename2 the second inverted file to merge
-	 * @deprecated
-	 */
-	public StructureMerger(String _srcfilename1, String _srcfilename2) {
-		String[] p1 = getIndexPathPrefix(_srcfilename1);
-		String[] p2 = getIndexPathPrefix(_srcfilename2);
-		srcIndex1 = Index.createIndex(p1[0], p1[1]);
-		srcIndex2 = Index.createIndex(p2[0], p2[1]);
-
-		//invertedFile1 = _filename1;
-		//invertedFile2 = _filename2;
-		numberOfDocuments = 0;
-		numberOfPointers = 0;
-		numberOfTerms = 0;
-	}
 	/**
 	 * Sets the number of bits to write or read for binary encoded numbers
@@ -165,16 +138,7 @@
 		binaryBits = bits;
 	}
-	/**
-	 * Sets the output filename of the merged inverted file
-	 * @param _outputName the filename of the merged inverted file
-	 * @deprecated
-	 */
-	public void setOutputFilename(String _outputName) {
-		//invertedFileOutput = _outputName;
-		String[] p = getIndexPathPrefix(_outputName);
-		destIndex = Index.createNewIndex(p[0], p[1]);
-	}
+
 	/**
 	 * Sets the output index. This index should have no documents
@@ -191,6 +155,7 @@
 	 * lexicon are not correct. They will be updated only after creating the
 	 * inverted file.
 	 */
+	@SuppressWarnings("unchecked")
 	protected void mergeInvertedFiles() {
 		try {
 			//getting the number of entries in the first document index,
@@ -208,14 +173,25 @@
 				termcodeHashmap = new TIntIntHashMap();
 			//setting the input streams
-			LexiconInputStream lexInStream1 = (LexiconInputStream)srcIndex1.getIndexStructureInputStream("lexicon");
-			LexiconInputStream lexInStream2 = (LexiconInputStream)srcIndex2.getIndexStructureInputStream("lexicon");
-
-			LexiconOutputStream lexOutStream = UTFIndexing
-				?
new UTFLexiconOutputStream(destIndex.getPath(), destIndex.getPrefix()) - : new LexiconOutputStream(destIndex.getPath(), destIndex.getPrefix()); - - + Iterator> lexInStream1 = + (Iterator>)srcIndex1.getIndexStructureInputStream("lexicon"); + Iterator> lexInStream2 = + (Iterator>)srcIndex2.getIndexStructureInputStream("lexicon"); + + for(String property : new String[] {"index.lexicon-keyfactory.class", "index.lexicon-keyfactory.parameter_values", + "index.lexicon-keyfactory.parameter_types", "index.lexicon-valuefactory.class", "index.lexicon-valuefactory.parameter_values", + "index.lexicon-valuefactory.parameter_types"} ) + { + destIndex.setIndexProperty(property, srcIndex1.getIndexProperty(property, null)); + } + + FixedSizeWriteableFactory lvf = + (FixedSizeWriteableFactory)srcIndex1.getIndexStructure("lexicon-valuefactory"); + + //setting the output stream + LexiconOutputStream lexOutStream = + new MapFileLexiconOutputStream(destIndex, "lexicon", (Class >) lvf.getClass()); + int newCodes = (int)srcIndex1.getCollectionStatistics().getNumberOfUniqueTerms(); InvertedIndex inverted1 = srcIndex1.getInvertedIndex(); @@ -227,7 +203,7 @@ (DirectInvertedOutputStream)invertedFileOutputStreamClass .getConstructor(String.class,Integer.TYPE) .newInstance(destIndex.getPath() + ApplicationSetup.FILE_SEPARATOR + - destIndex.getPrefix() + ApplicationSetup.IFSUFFIX, + destIndex.getPrefix() + ".inverted.bf", binaryBits); } catch (Exception e) { logger.error("Couldn't create specified DirectInvertedOutputStream", e); @@ -237,162 +213,166 @@ //BitOut invertedOutput = new BitOutputStream( // ); - int hasMore1 = -1; - int hasMore2 = -1; + boolean hasMore1 = false; + boolean hasMore2 = false; String term1; String term2; - - hasMore1 = lexInStream1.readNextEntry(); - hasMore2 = lexInStream2.readNextEntry(); - while (hasMore1 >=0 && hasMore2 >= 0) { - term1 = lexInStream1.getTerm(); - term2 = lexInStream2.getTerm(); + Map.Entry lee1 = null; + Map.Entry lee2 = null; + hasMore1 = lexInStream1.hasNext(); + if (hasMore1) + lee1 = lexInStream1.next(); + hasMore2 = lexInStream2.hasNext(); + if (hasMore2) + lee2 = lexInStream2.next(); + while (hasMore1 && hasMore2) { + + term1 = lee1.getKey(); + term2 = lee2.getKey(); int lexicographicalCompare = term1.compareTo(term2); - //System.err.println("Comparing "+lexInStream1.getTermId() +":"+ term1 + " with "+lexInStream2.getTermId()+ ":"+ term2 + " results="+lexicographicalCompare); if (lexicographicalCompare < 0) { //write to inverted file as well. 
- int[][] docs = inverted1.getDocuments(lexInStream1.getTermId()); + + int[][] docs = inverted1.getDocuments(lee1.getValue()); + long startOffset = invOS.getByteOffset(); + byte startBitOffset = invOS.getBitOffset(); + invOS.writePostings(docs, docs[0][0]+1); - //writePostings(docs, docs[0][0]+1, invertedOutput, binaryBits); numberOfPointers+=docs[0].length; - long endOffset = invOS.getByteOffset(); - byte endBitOffset = invOS.getBitOffset(); - endBitOffset--; - if (endBitOffset < 0 && endOffset > 0) { - endBitOffset = 7; - endOffset--; - } - - lexOutStream.writeNextEntry(term1, - lexInStream1.getTermId(), - lexInStream1.getNt(), - lexInStream1.getTF(), - endOffset, - endBitOffset); - hasMore1 = lexInStream1.readNextEntry(); +// long endOffset = invOS.getByteOffset(); +// byte endBitOffset = invOS.getBitOffset(); +// endBitOffset--; +// if (endBitOffset < 0 && endOffset > 0) { +// endBitOffset = 7; +// endOffset--; +// } + lee1.getValue().setPosition(startOffset, startBitOffset); + lexOutStream.writeNextEntry(term1, lee1.getValue()); + hasMore1 = lexInStream1.hasNext(); + if (hasMore1) + lee1 = lexInStream1.next(); } else if (lexicographicalCompare > 0) { //write to inverted file as well. - int[][] docs = inverted2.getDocuments(lexInStream2.getTermId()); + int[][] docs = inverted2.getDocuments(lee2.getValue()); + long startOffset = invOS.getByteOffset(); + byte startBitOffset = invOS.getBitOffset(); invOS.writePostings(docs, docs[0][0]+numberOfDocs1+1); - //writePostings(docs, docs[0][0]+numberOfDocs1+1, invertedOutput, binaryBits); - numberOfPointers+=docs[0].length; - long endOffset = invOS.getByteOffset(); - byte endBitOffset = invOS.getBitOffset(); - endBitOffset--; - if (endBitOffset < 0 && endOffset > 0) { - endBitOffset = 7; - endOffset--; - } + numberOfPointers+=docs[0].length; +// long endOffset = invOS.getByteOffset(); +// byte endBitOffset = invOS.getBitOffset(); +// +// endBitOffset--; +// if (endBitOffset < 0 && endOffset > 0) { +// endBitOffset = 7; +// endOffset--; +// } int newCode = newCodes++; if (keepTermCodeMap) - termcodeHashmap.put(lexInStream2.getTermId(), newCode); - - lexOutStream.writeNextEntry(term2, - newCode, - lexInStream2.getNt(), - lexInStream2.getTF(), - endOffset, - endBitOffset); - hasMore2 = lexInStream2.readNextEntry(); + termcodeHashmap.put(lee2.getValue().getTermId(), newCode); + lee2.getValue().setTermId(newCode); + lee2.getValue().setPosition(startOffset, startBitOffset); + lexOutStream.writeNextEntry(term2, lee2.getValue()); + hasMore2 = lexInStream2.hasNext(); + if (hasMore2) + lee2 = lexInStream2.next(); } else { //write to inverted file as well. 
-					int[][] docs1 = inverted1.getDocuments(lexInStream1.getTermId());
-					int[][] docs2 = inverted2.getDocuments(lexInStream2.getTermId());
+					int[][] docs1 = inverted1.getDocuments(lee1.getValue());
+					int[][] docs2 = inverted2.getDocuments(lee2.getValue());
+					long startOffset = invOS.getByteOffset();
+					byte startBitOffset = invOS.getBitOffset();
 					invOS.writePostings(docs1, docs1[0][0]+1);
-					//writePostings(docs1, docs1[0][0]+1, invertedOutput, binaryBits);
 					numberOfPointers+=docs1[0].length;
 					invOS.writePostings(docs2, docs2[0][0] + numberOfDocs1 - docs1[0][docs1[0].length-1]);
-					//writePostings(docs2, docs2[0][0] + numberOfDocs1 - docs1[0][docs1[0].length-1],
-					//	invertedOutput, binaryBits);
 					numberOfPointers+=docs2[0].length;
-					long endOffset = invOS.getByteOffset();
-					byte endBitOffset = invOS.getBitOffset();
-					endBitOffset--;
-					if (endBitOffset < 0 && endOffset > 0) {
-						endBitOffset = 7;
-						endOffset--;
-					}
+//					long endOffset = invOS.getByteOffset();
+//					byte endBitOffset = invOS.getBitOffset();
+//					endBitOffset--;
+//					if (endBitOffset < 0 && endOffset > 0) {
+//						endBitOffset = 7;
+//						endOffset--;
+//					}
+
-					int newCode = lexInStream1.getTermId();
+					lee1.getValue().setPosition(startOffset, startBitOffset);
+					int newCode = lee1.getValue().getTermId();
 					if (keepTermCodeMap)
-						termcodeHashmap.put(lexInStream2.getTermId(), newCode);
+						termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
-					lexOutStream.writeNextEntry(term1,
-						newCode,
-						(lexInStream1.getNt() + lexInStream2.getNt()),
-						(lexInStream1.getTF() + lexInStream2.getTF()),
-						endOffset,
-						endBitOffset);
-					hasMore1 = lexInStream1.readNextEntry();
-					hasMore2 = lexInStream2.readNextEntry();
+					lee1.getValue().add(lee2.getValue());
+					lexOutStream.writeNextEntry(term1, lee1.getValue());
+					hasMore1 = lexInStream1.hasNext();
+					if (hasMore1)
+						lee1 = lexInStream1.next();
+
+					hasMore2 = lexInStream2.hasNext();
+					if (hasMore2)
+						lee2 = lexInStream2.next();
 				}
 			}
-			if (hasMore1 >= 0) {
-				while (hasMore1 >= 0) {
-
+			if (hasMore1) {
+				//drain stream 1, starting with the entry already fetched
+				while (hasMore1) {
 					//write to inverted file as well.
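A note on the docid arithmetic in the postings writes above. Postings are gap-encoded, and writePostings() is given the value to emit for the first posting of a list. Documents of the second index are renumbered to follow those of the first, so a list that opens a term is written starting from docid+1, while a second list concatenated under the same term is joined by the gap from the last docid already written. A worked example, with made-up numbers:

// Illustrative numbers only; mirrors the expressions passed to invOS.writePostings().
int numberOfDocs1 = 1000;          // documents in the first (lower-docid) index
int[] termDocs1 = { 3, 17, 998 };  // 0-based docids for a term in index 1
int[] termDocs2 = { 5, 40 };       // 0-based docids for the same term in index 2

// A list that opens a term is written starting from docid+1
// (ids are 0-based, and the first value written must be >= 1):
int first1 = termDocs1[0] + 1;     // 4

// A term occurring only in index 2 opens its list at the shifted docid + 1:
int first2Only = termDocs2[0] + numberOfDocs1 + 1;   // 1006

// When both lists are written under one term, the second list starts at the
// gap between its first (shifted) docid and the last docid of the first list:
int first2 = termDocs2[0] + numberOfDocs1 - termDocs1[termDocs1.length - 1];
// = 5 + 1000 - 998 = 7, i.e. renumbered docid 1005 follows docid 998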
- int[][] docs = inverted2.getDocuments(lexInStream2.getTermId()); + int[][] docs = inverted2.getDocuments(lee2.getValue()); + long startOffset = invOS.getByteOffset(); + byte startBitOffset = invOS.getBitOffset(); invOS.writePostings(docs, docs[0][0]+numberOfDocs1+1); - //writePostings(docs, docs[0][0]+numberOfDocs1+1, invertedOutput, binaryBits); + numberOfPointers+=docs[0].length; - long endOffset = invOS.getByteOffset(); - byte endBitOffset = invOS.getBitOffset(); - - //long endOffset = invertedOutput.getByteOffset(); - //byte endBitOffset = invertedOutput.getBitOffset(); - endBitOffset--; - if (endBitOffset < 0 && endOffset > 0) { - endBitOffset = 7; - endOffset--; - } +// long endOffset = invOS.getByteOffset(); +// byte endBitOffset = invOS.getBitOffset(); +// endBitOffset--; +// if (endBitOffset < 0 && endOffset > 0) { +// endBitOffset = 7; +// endOffset--; +// } int newCode = newCodes++; if (keepTermCodeMap) - termcodeHashmap.put(lexInStream2.getTermId(), newCode); - - lexOutStream.writeNextEntry(lexInStream2.getTerm(), - newCode, - lexInStream2.getNt(), - lexInStream2.getTF(), - endOffset, - endBitOffset); - hasMore2 = lexInStream2.readNextEntry(); + termcodeHashmap.put(lee2.getValue().getTermId(), newCode); + lee2.getValue().setTermId(newCode); + lee2.getValue().setPosition(startOffset, startBitOffset); + lexOutStream.writeNextEntry(lee2.getKey(), lee2.getValue()); + hasMore2 = lexInStream2.hasNext(); + if (hasMore2) + lee2 = lexInStream2.next(); } } - lexInStream1.close(); - lexInStream2.close(); + if (lexInStream1 instanceof Closeable) { + ((Closeable)lexInStream1).close(); + } + if (lexInStream2 instanceof Closeable) { + ((Closeable)lexInStream2).close(); + } inverted1.close(); @@ -400,25 +380,16 @@ invOS.close(); destIndex.setIndexProperty("num.Documents", ""+numberOfDocuments); - destIndex.setIndexProperty("num.Pointers", ""+lexOutStream.getNumberOfPointersWritten()); - destIndex.setIndexProperty("num.Terms", ""+lexOutStream.getNumberOfTermsWritten()); - destIndex.setIndexProperty("num.Tokens", ""+lexOutStream.getNumberOfTokensWritten()); - destIndex.addIndexStructure("lexicon", UTFIndexing - ? "uk.ac.gla.terrier.structures.UTFLexicon" - : "uk.ac.gla.terrier.structures.Lexicon"); - destIndex.addIndexStructureInputStream("lexicon", UTFIndexing - ? "uk.ac.gla.terrier.structures.UTFLexiconInputStream" - : "uk.ac.gla.terrier.structures.LexiconInputStream"); destIndex.addIndexStructure( "inverted", invertedFileInputClass, - "uk.ac.gla.terrier.structures.Lexicon,java.lang.String,java.lang.String", - "lexicon,path,prefix"); + "uk.ac.gla.terrier.structures.Index,java.lang.String", + "index,structureName"); destIndex.addIndexStructureInputStream( - "inverted", - invertedFileInputStreamClass, - "java.lang.String,java.lang.String,uk.ac.gla.terrier.structures.LexiconInputStream", - "path,prefix,lexicon-inputstream"); + "inverted", + invertedFileInputStreamClass, + "uk.ac.gla.terrier.structures.Index,java.lang.String,java.util.Iterator", + "index,structureName,lexicon-inputstream"); lexOutStream.close(); destIndex.flush(); @@ -427,129 +398,6 @@ } } - /** - * Merges the two lexicons into one. After this stage, the offsets in the - * lexicon are not correct. - */ - protected void mergeLexicons() { - try { - //getting the number of entries in the first document index, - //in order to assign the correct docids to the documents - //of the second inverted file. 
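The addIndexStructure()/addIndexStructureInputStream() calls above register a structure as a class name plus comma-separated constructor parameter types and symbolic parameter values; the Index presumably resolves symbols such as index and structureName to live objects and instantiates the structure reflectively on first use. The resolution logic itself is not shown in this patch, so the following is only a hedged sketch of the idea, with the symbols resolved by hand:

// Hedged sketch: the real resolution lives inside Index and may differ.
static Object loadRegisteredStructure(Object index, String structureName) throws Exception
{
	// class name as recorded in the index properties (value of invertedFileInputClass)
	String className = "uk.ac.gla.terrier.structures.InvertedIndex";
	Class<?> indexClass = Class.forName("uk.ac.gla.terrier.structures.Index");
	Class<?> structureClass = Class.forName(className);
	// parameter types "uk.ac.gla.terrier.structures.Index,java.lang.String"
	// with values "index,structureName" resolved to the two arguments below
	return structureClass.getConstructor(indexClass, String.class)
		.newInstance(index, structureName);
}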
- - //creating a new map between new and old term codes - if (keepTermCodeMap) - termcodeHashmap = new TIntIntHashMap(); - - //setting the input streams - final LexiconInputStream lexInStream1 = (LexiconInputStream)srcIndex1.getIndexStructureInputStream("lexicon"); - final LexiconInputStream lexInStream2 = (LexiconInputStream)srcIndex2.getIndexStructureInputStream("lexicon"); - - final LexiconOutputStream lexOutStream = UTFIndexing - ? new UTFLexiconOutputStream(destIndex.getPath(), destIndex.getPrefix()) - : new LexiconOutputStream(destIndex.getPath(), destIndex.getPrefix()); - - - int newCodes = (int)srcIndex1.getCollectionStatistics().getNumberOfUniqueTerms(); - - int hasMore1 = -1; - int hasMore2 = -1; - String term1; - String term2; - - hasMore1 = lexInStream1.readNextEntry(); - hasMore2 = lexInStream2.readNextEntry(); - while (hasMore1 >=0 && hasMore2 >= 0) { - term1 = lexInStream1.getTerm(); - term2 = lexInStream2.getTerm(); - - int lexicographicalCompare = term1.compareTo(term2); - if (lexicographicalCompare < 0) { - - lexOutStream.writeNextEntry(term1, - lexInStream1.getTermId(), - lexInStream1.getNt(), - lexInStream1.getTF(), - 0L, - (byte)0); - hasMore1 = lexInStream1.readNextEntry(); - - } else if (lexicographicalCompare > 0) { - int newCode = newCodes++; - if (keepTermCodeMap) - termcodeHashmap.put(lexInStream2.getTermId(), newCode); - - lexOutStream.writeNextEntry(term2, - newCode, - lexInStream2.getNt(), - lexInStream2.getTF(), - 0L, - (byte)0); - hasMore2 = lexInStream2.readNextEntry(); - } else { - int newCode = lexInStream1.getTermId(); - if (keepTermCodeMap) - termcodeHashmap.put(lexInStream2.getTermId(), newCode); - - lexOutStream.writeNextEntry(term1, - newCode, - (lexInStream1.getNt() + lexInStream2.getNt()), - (lexInStream1.getTF() + lexInStream2.getTF()), - 0L, - (byte)0); - hasMore1 = lexInStream1.readNextEntry(); - hasMore2 = lexInStream2.readNextEntry(); - } - } - - if (hasMore1 >= 0) { - while (hasMore1 >= 0) { - - lexOutStream.writeNextEntry(lexInStream1.getTerm(), - lexInStream1.getTermId(), - lexInStream1.getNt(), - lexInStream1.getTF(), - 0L, - (byte)0); - hasMore1 = lexInStream1.readNextEntry(); - } - } else if (hasMore2 >= 0) { - while (hasMore2 >= 0) { - int newCode = newCodes++; - if (keepTermCodeMap) - termcodeHashmap.put(lexInStream2.getTermId(), newCode); - - lexOutStream.writeNextEntry(lexInStream2.getTerm(), - newCode, - lexInStream2.getNt(), - lexInStream2.getTF(), - 0L, - (byte)0); - hasMore2 = lexInStream2.readNextEntry(); - } - } - - lexInStream1.close(); - lexInStream2.close(); - - - destIndex.setIndexProperty("num.Documents", ""+numberOfDocuments); - destIndex.setIndexProperty("num.Pointers", ""+lexOutStream.getNumberOfPointersWritten()); - destIndex.setIndexProperty("num.Terms", ""+lexOutStream.getNumberOfTermsWritten()); - destIndex.setIndexProperty("num.Tokens", ""+lexOutStream.getNumberOfTokensWritten()); - destIndex.addIndexStructure("lexicon", UTFIndexing - ? "uk.ac.gla.terrier.structures.UTFLexicon" - : "uk.ac.gla.terrier.structures.Lexicon"); - destIndex.addIndexStructureInputStream("lexicon", UTFIndexing - ? "uk.ac.gla.terrier.structures.UTFLexiconInputStream" - : "uk.ac.gla.terrier.structures.LexiconInputStream"); - lexOutStream.close(); - destIndex.flush(); - - } catch(IOException ioe) { - logger.error("IOException while merging lexicons.", ioe); - } - } /** * Merges the two direct files and the corresponding document id files. @@ -710,17 +558,7 @@ * creates the final term code to offset file, and the lexicon hash if enabled. 
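With mergeLexicons() removed from StructureMerger, a lexicon-only merge is now the job of LexiconMerger, as the bothLexicon branch of mergeStructures() below illustrates. A hedged usage sketch, with hypothetical paths and prefixes; the constructor call mirrors the one the patch introduces:

// Fragment; paths and prefixes are made up for illustration.
Index src1 = Index.createIndex("/data/indices/shard1", "data");
Index src2 = Index.createIndex("/data/indices/shard2", "data");
Index dest = Index.createNewIndex("/data/indices/merged", "data");
new LexiconMerger(src1, src2, dest).mergeLexicons();
// mergeLexicons() writes the merged lexicon, runs
// LexiconBuilder.optimise(dest, "lexicon") and flushes the destination index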
*/ protected void createLexidFile() { - try { - LexiconBuilder.createLexiconIndex(destIndex); - } catch(IOException ioe) { - logger.error("IOException while creating lexid file.", ioe); - } - if (USE_HASH) - try{ - LexiconBuilder.createLexiconHash(destIndex); - } catch (IOException ioe) { - logger.error("IOException while creating lexicon hash file", ioe); - } + LexiconBuilder.optimise(destIndex, "lexicon"); } /** @@ -743,7 +581,7 @@ } else if (bothLexicon) { - mergeLexicons(); + new LexiconMerger(srcIndex1, srcIndex2, destIndex).mergeLexicons(); t2 = System.currentTimeMillis(); logger.info("merged lexicons in " + ((t2-t1)/1000.0d)); } @@ -804,7 +642,8 @@ long start = System.currentTimeMillis(); logger.info("started at " + (new Date())); if (ApplicationSetup.getProperty("merger.onlylexicons","false").equals("true")) { - sMerger.mergeLexicons(); + System.err.println("Use LexiconMerger"); + return; } else if (ApplicationSetup.getProperty("merger.onlydocids","false").equals("true")) { sMerger.mergeDocumentIndexFiles(); } else { @@ -819,84 +658,7 @@ logger.info("time elapsed: " + ((end-start)*1.0d/1000.0d) + " sec."); } - /** - * Writes the given postings to a bit file. Depending on - * the value of the field binaryBits, this method will call the - * appropriate method writeToInvertedFileFields, or - * writeToInvertedFileNoFields. - * @param postings the postings list to write. - * @param firstId the first identifier to write. This can be - * an id plus one, or the gap of the current id and the previous one. - * @param output the output bit file. - * @deprecated Please use DirectInvertedOutputStream instead - */ - public static void writePostings(int[][] postings, int firstId, BitOut output, int binaryBits) - throws IOException { - if (binaryBits>0) - writeFieldPostings(postings, firstId, output, binaryBits); - else - writeNoFieldPostings(postings, firstId, output); - } - - /** - * Writes the given postings to a bit file. This method assumes that - * field information is available as well. - * @param postings the postings list to write. - * @param firstId the first identifier to write. This can be - * an id plus one, or the gap of the current id and the previous one. - * @param output the output bit file. - * @deprecated use DirectInvertedIndexOutputStream - */ - public static void writeFieldPostings(int[][] postings, int firstId, BitOut output, int binaryBits) - throws IOException { - - //local variables in order to reduce the number - //of times we need to access a two-dimensional array - final int[] postings0 = postings[0]; - final int[] postings1 = postings[1]; - final int[] postings2 = postings[2]; - - //write the first entry - output.writeGamma(firstId); - output.writeUnary(postings1[0]); - output.writeBinary(binaryBits, postings2[0]); - - final int length = postings0.length; - for (int k = 1; k < length; k++) { - output.writeGamma(postings0[k] - postings0[k - 1]); - output.writeUnary(postings1[k]); - output.writeBinary(binaryBits, postings2[k]); - } - } - - /** - * Writes the given postings to a bit file. This method assumes that - * field information is not available. - * @param postings the postings list to write. - * @param firstId the first identifier to write. This can be - * an id plus one, or the gap of the current id and the previous one. - * @param output the output bit file. - * @throws IOException if an error occurs during writing to a file. 
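The deprecated writers above and below spell out the wire format they used: the first docid (plus one) and every subsequent docid gap are written as Elias gamma codes, each term frequency as a unary code, and field bits are appended with writeBinary() when binaryBits > 0. A minimal sketch of one common convention for those two codes follows; Terrier's BitOut may differ in bit order, so this is illustrative rather than a drop-in reimplementation:

import java.util.BitSet;

// Sketch of unary and Elias-gamma codes. Convention assumed here:
// unary(x) = (x-1) one-bits followed by a zero-bit;
// gamma(x) = unary(bit length of x) followed by the bits of x below its top bit.
class GammaUnarySketch
{
	final BitSet bits = new BitSet();
	int pos = 0;

	void writeUnary(int x)		// x >= 1
	{
		for (int i = 1; i < x; i++)
			bits.set(pos++);	// x-1 ones
		pos++;					// terminating zero
	}

	void writeGamma(int x)		// x >= 1
	{
		int nbits = 32 - Integer.numberOfLeadingZeros(x);	// floor(log2 x) + 1
		writeUnary(nbits);
		for (int i = nbits - 2; i >= 0; i--, pos++)			// low nbits-1 bits of x
			if (((x >>> i) & 1) == 1)
				bits.set(pos);
	}
}
// e.g. gamma(9): 9 = 1001b is 4 bits -> unary(4) = 1110, then 001 -> 1110001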
- * @deprecated use DirectInvertedIndexOutputStream - */ - public static void writeNoFieldPostings(int[][] postings, int firstId, BitOut output) - throws IOException { - //local variables in order to reduce the number - //of times we need to access a two-dimensional array - final int[] postings0 = postings[0]; - final int[] postings1 = postings[1]; - - //write the first entry - output.writeGamma(firstId); - output.writeUnary(postings1[0]); - - final int length = postings[0].length; - for (int k = 1; k < length; k++) { - output.writeGamma(postings0[k] - postings0[k - 1]); - output.writeUnary(postings1[k]); - } - } } diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/seralization/FixedSizeTextFactory.java src/uk/ac/gla/terrier/structures/seralization/FixedSizeTextFactory.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/seralization/FixedSizeTextFactory.java 1970-01-01 01:00:00.000000000 +0100 +++ src/uk/ac/gla/terrier/structures/seralization/FixedSizeTextFactory.java 2009-03-03 14:34:49.000000000 +0000 @@ -0,0 +1,125 @@ +package uk.ac.gla.terrier.structures.seralization; + +import org.junit.Test; +import static org.junit.Assert.*; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.DataInput; +import java.io.DataInputStream; +import java.io.DataOutput; +import java.io.DataOutputStream; +import java.io.IOException; + +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.WritableUtils; + +public class FixedSizeTextFactory implements FixedSizeWriteableFactory { + class FixedSizeText extends Text { + public FixedSizeText() { + super(); + } + + public FixedSizeText(byte[] b) { + super(b); + } + + public FixedSizeText(String s) { + super(s); + } + + public FixedSizeText(Text t) { + super(t); + } + + @Override + public void readFields(DataInput in) throws IOException { + super.readFields(in); + //System.err.println("Term "+this.toString() + " read in "+ (this.getLength()+WritableUtils.getVIntSize(this.getLength())) + " bytes"); + in.skipBytes(maxKeyWrittenSize - (this.getLength()+WritableUtils.getVIntSize(this.getLength()))); + } + + @Override + public void write(DataOutput out) throws IOException { + super.write(out); + out.write(ZERO_BUFFER, 0, maxKeyWrittenSize - (this.getLength()+WritableUtils.getVIntSize(this.getLength()))); + //System.err.println("Term "+this.toString() + " written in "+ (this.getLength()+WritableUtils.getVIntSize(this.getLength())) + " bytes"); + } + } + + final byte[] ZERO_BUFFER; + final int termLength; + final int maxKeyWrittenSize; + + public FixedSizeTextFactory(String _termLength) + { + this(Integer.parseInt(_termLength)); + } + + public FixedSizeTextFactory(int _termLength) + { + this.termLength = _termLength; //TODO : consider non-utf terms - need to derive maximum size + this.maxKeyWrittenSize = WritableUtils.getVIntSize(termLength) + 3*termLength; + //System.err.println("Max key size, for terms up "+termLength+" is "+ maxKeyWrittenSize); + ZERO_BUFFER = new byte[maxKeyWrittenSize]; + } + + public int getSize() { + + return maxKeyWrittenSize; + } + + public Text newInstance() { + return new FixedSizeText(); + } + + public static class Tester + { + static String makeStringOfLength(char c, int length) + { + StringBuilder s = new StringBuilder(); + for(int i = 0; i factory = new FixedSizeTextFactory(length); + int bytes = factory.getSize(); + + String[] testStrings = { + "", "a", "abat", + "1234567890", 
"123456789001234567890", + "\u0290\u0290", + makeStringOfLength('\u0290', length), + makeStringOfLength('\u0690', length) + }; + for (String s : testStrings) + { + byte[] b = getBytes(factory, s); + assertEquals(b.length, bytes); + assertEquals(s, getString(factory, b)); + } + } + + static String getString(FixedSizeWriteableFactory factory, byte[] b) throws Exception + { + ByteArrayInputStream buffer = new ByteArrayInputStream(b); + DataInputStream dis = new DataInputStream(buffer); + Text t = factory.newInstance(); + t.readFields(dis); + return t.toString(); + } + + static byte[] getBytes(FixedSizeWriteableFactory factory, String s) throws Exception + { + ByteArrayOutputStream buffer = new ByteArrayOutputStream(); + DataOutputStream dos = new DataOutputStream(buffer); + Text t = factory.newInstance(); + t.set(s); + t.write(dos); + return buffer.toByteArray(); + } + } +} diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/seralization/FixedSizeWriteableFactory.java src/uk/ac/gla/terrier/structures/seralization/FixedSizeWriteableFactory.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/seralization/FixedSizeWriteableFactory.java 1970-01-01 01:00:00.000000000 +0100 +++ src/uk/ac/gla/terrier/structures/seralization/FixedSizeWriteableFactory.java 2009-03-03 14:34:49.000000000 +0000 @@ -0,0 +1,8 @@ +package uk.ac.gla.terrier.structures.seralization; + + + +public interface FixedSizeWriteableFactory extends WriteableFactory +{ + public int getSize(); +} \ No newline at end of file diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/seralization/WriteableFactory.java src/uk/ac/gla/terrier/structures/seralization/WriteableFactory.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/structures/seralization/WriteableFactory.java 1970-01-01 01:00:00.000000000 +0100 +++ src/uk/ac/gla/terrier/structures/seralization/WriteableFactory.java 2009-03-03 14:34:49.000000000 +0000 @@ -0,0 +1,6 @@ +package uk.ac.gla.terrier.structures.seralization; + +public interface WriteableFactory +{ + public T newInstance(); +} \ No newline at end of file diff -x parser -x tests -x upgrading -x .classpath -x .project -x CVS -x html -urN ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/utility/ApplicationSetup.java src/uk/ac/gla/terrier/utility/ApplicationSetup.java --- ../../../CURRENT_HEAD/terrier/src/uk/ac/gla/terrier/utility/ApplicationSetup.java 2009-01-28 20:17:02.000000000 +0000 +++ src/uk/ac/gla/terrier/utility/ApplicationSetup.java 2009-03-03 14:34:49.000000000 +0000 @@ -230,7 +230,7 @@ * property is if.suffix and by default * the value of this property is .if */ - public static String IFSUFFIX; + //public static String IFSUFFIX; /** * The suffix of the file that contains the @@ -238,7 +238,7 @@ * lexicon.suffix and by default * the value of this property is .lex */ - public static String LEXICONSUFFIX; + //public static String LEXICONSUFFIX; /** * The suffix of the file that contains the @@ -255,11 +255,11 @@ * property is lexicon.index.suffix and * by default its value is .lexid. */ - public static String LEXICON_INDEX_SUFFIX; + //public static String LEXICON_INDEX_SUFFIX; /** The suffix of the lexicon hash file. Corresponding property * is lexicon.hash.suffix, default is ".lexhash". 
*/
-	public static String LEXICON_HASH_SUFFIX;
+	//public static String LEXICON_HASH_SUFFIX;
+
 	/**
@@ -289,7 +289,7 @@
 	 * lexicon files. It corresponds to the property
 	 * merge.prefix and the default value is MRG_.
 	 */
-	public static String MERGE_PREFIX;
+	//public static String MERGE_PREFIX;
 	/**
 	 * A progressive number which is assigned to the
@@ -299,7 +299,7 @@
 	 * the property merge.temp.number and the default value
 	 * is 100000
 	 */
-	public static int MERGE_TEMP_NUMBER;
+	//public static int MERGE_TEMP_NUMBER;
 	/**
 	 * The number of documents to be processed as a group during indexing.
@@ -308,7 +308,7 @@
 	 * create a single lexicon. It corresponds to the property
 	 * bundle.size and the default value is 2000.
 	 */
-	public static int BUNDLE_SIZE;
+	//public static int BUNDLE_SIZE;
 	/**
 	 * The number of bytes used to store a term. Corresponds to MAX_TERM_LENGTH
@@ -346,16 +346,16 @@
 	public static String TERRIER_INDEX_PREFIX;
 
 	/** The filename of the inverted file.*/
-	public static String INVERTED_FILENAME;
+	//public static String INVERTED_FILENAME;
 	/** The filename of the direct file.*/
 	public static String DIRECT_FILENAME;
 	/** The filename of the document index.*/
 	public static String DOCUMENT_INDEX_FILENAME;
 	/** The filename of the lexicon file.*/
-	public static String LEXICON_FILENAME;
+	//public static String LEXICON_FILENAME;
 	/** The filename of the lexicon index file.*/
-	public static String LEXICON_INDEX_FILENAME;
+	//public static String LEXICON_INDEX_FILENAME;
 	/** The filename of the log (statistics) file.*/
 	public static String LOG_FILENAME;
@@ -532,10 +532,10 @@
 		//The following properties specify the filenames and suffixes
 		COLLECTION_SPEC = makeAbsolute(getProperty("collection.spec", "collection.spec"), TERRIER_ETC);
-		IFSUFFIX = getProperty("if.suffix", ".if");
-		LEXICONSUFFIX = getProperty("lexicon.suffix", ".lex");
-		LEXICON_INDEX_SUFFIX = getProperty("lexicon.index.suffix", ".lexid");
-		LEXICON_HASH_SUFFIX = getProperty("lexicon.hash.suffix",".lexhash");
+		//IFSUFFIX = getProperty("if.suffix", ".if");
+		//LEXICONSUFFIX = getProperty("lexicon.suffix", ".lex");
+		//LEXICON_INDEX_SUFFIX = getProperty("lexicon.index.suffix", ".lexid");
+		//LEXICON_HASH_SUFFIX = getProperty("lexicon.hash.suffix",".lexhash");
 		DOC_INDEX_SUFFIX = getProperty("doc.index.suffix", ".docid");
 		LOG_SUFFIX = getProperty("log.suffix", ".log");
 		DF_SUFFIX = getProperty("df.suffix", ".df");
@@ -545,8 +545,8 @@
 		//documents. The prefix merge.prefix and the number merge.temp.number
 		//specify the names of the temporary lexicons created
 		//while building a global lexicon.
-		MERGE_PREFIX = getProperty("merge.prefix", "MRG_");
-		MERGE_TEMP_NUMBER = Integer.parseInt(getProperty("merge.temp.number", "100000"));
+		//MERGE_PREFIX = getProperty("merge.prefix", "MRG_");
+		//MERGE_TEMP_NUMBER = Integer.parseInt(getProperty("merge.temp.number", "100000"));
 
 		//if a document is empty, that is it does not contain any terms,
 		//we have the option to add it to the index, or not. By default,
@@ -555,7 +555,7 @@
 		//During the indexing process, we process and create temporary structures
 		//for bundle.size files.
- BUNDLE_SIZE = Integer.parseInt(getProperty("bundle.size", "2000")); + //BUNDLE_SIZE = Integer.parseInt(getProperty("bundle.size", "2000")); //the maximum size of a term (string) MAX_TERM_LENGTH = Integer.parseInt(getProperty("max.term.length", "20")); @@ -693,11 +693,11 @@ */ public static void setupFilenames() { String filenameTemplate = TERRIER_INDEX_PATH + FILE_SEPARATOR + TERRIER_INDEX_PREFIX; - INVERTED_FILENAME =filenameTemplate + IFSUFFIX; + //INVERTED_FILENAME =filenameTemplate + IFSUFFIX; DIRECT_FILENAME = filenameTemplate + DF_SUFFIX; DOCUMENT_INDEX_FILENAME = filenameTemplate + DOC_INDEX_SUFFIX; - LEXICON_FILENAME = filenameTemplate + LEXICONSUFFIX; - LEXICON_INDEX_FILENAME = filenameTemplate + LEXICON_INDEX_SUFFIX; + //LEXICON_FILENAME = filenameTemplate + LEXICONSUFFIX; + //LEXICON_INDEX_FILENAME = filenameTemplate + LEXICON_INDEX_SUFFIX; LOG_FILENAME = filenameTemplate + LOG_SUFFIX; }
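Finally, it is worth connecting the pieces: the lexicon can drop its separate .lexid and .lexhash companion files precisely because the MapFile introduced earlier in this patch provides logarithmic lookups on its own. Fixed-size records mean entry i lives at offset i * entrySize, so getEntry() can binary-search the data file directly. A compact, standard restatement of that search against a hypothetical in-memory reader follows (MapFile additionally supports pluggable start/end bounds via MapFileBSearchShortcut):

// Hypothetical stand-in for dataFile.seek((long)i * entrySize) + key.readFields(...).
interface RecordReader<K extends Comparable<K>>
{
	K keyAt(int i);
	int size();
}

// Standard inclusive-bounds binary search over the fixed-size records;
// returns the entry index, or -1 where MapFile.getEntry() returns null.
static <K extends Comparable<K>> int indexOf(RecordReader<K> records, K key)
{
	int low = 0, high = records.size() - 1;
	while (low <= high)
	{
		int mid = (low + high) >>> 1;
		int cmp = key.compareTo(records.keyAt(mid));
		if (cmp < 0)
			high = mid - 1;
		else if (cmp > 0)
			low = mid + 1;
		else
			return mid;		// found: MapFile then deserializes the value as well
	}
	return -1;
}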