package org.kit.furia;

import org.ajmm.obsearch.OB;
import java.io.IOException;
import java.util.List;
import org.ajmm.obsearch.Index;
import org.ajmm.obsearch.exception.AlreadyFrozenException;
import org.ajmm.obsearch.exception.IllegalIdException;
import org.ajmm.obsearch.exception.OBException;
import org.ajmm.obsearch.exception.OutOfRangeException;
import org.ajmm.obsearch.exception.UndefinedPivotsException;
import org.apache.lucene.index.CorruptIndexException;
import org.kit.furia.exceptions.IRException;
import com.sleepycat.je.DatabaseException;

/*
 Furia-chan: An Open Source software license violation detector.
 Copyright (C) 2007 Kyushu Institute of Technology

 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.

 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.

 You should have received a copy of the GNU General Public License
 along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

/**
 * IRIndex holds the basic functionality for an Information Retrieval system
 * that works on OB objects (please see obsearch.berlios.de). By using a
 * distance function d, we transform the queries in terms of the closest
 * elements that are in the database, and once this transformation is
 * performed, we utilize an information retrieval system to perform the
 * matching. Because our documents are multi-sets, the distribution of OB
 * objects inside a document is taken into account. So, instead of matching a
 * huge syntax tree of, for example, music, we cut a song into pieces, match
 * the pieces, and then the overall fingerprint of the multi-set of OB objects
 * is matched.
 *
 * @param <O>
 *            The type of OB object stored in the documents of this index.
 * @author Arnoldo Jose Muller Molina
 * @since 0
 */
public interface IRIndex < O extends OB > {

    /**
     * Inserts a new document into the database.
     *
     * @param document
     *            The document to be inserted.
     * @throws IRException
     *             If something goes wrong with the IR engine or with
     *             OBSearch.
     */
    void insert(Document < O > document) throws IRException;

    /**
     * Deletes the document with the given name from the database. If more
     * than one document has the same name, all of them are erased.
     *
     * @param documentName
     *            The name of the document(s) to delete.
     * @return The number of documents deleted.
     * @throws IRException
     *             If something goes wrong with the IR engine or with
     *             OBSearch.
     */
    int delete(String documentName) throws IRException;

    /**
     * Returns the underlying OBSearch index.
     *
     * @return the underlying OBSearch index.
     */
    Index < O > getIndex();

    /**
     * Freezes the index. From this point on, data can be inserted, searched
     * and deleted. The index might deteriorate at some point, so every once
     * in a while it is a good idea to rebuild the index.
     * <p>
     * NOTE(review): the original comment ended with an unfinished sentence
     * ("This method will also"); any additional work performed by this
     * method is therefore undocumented. Confirm against the implementation.
     *
     * @throws IRException
     *             If something goes wrong with the IR engine or with
     *             OBSearch.
     */
    void freeze() throws IRException;

    /**
     * Closes the databases. You *should* close the databases after using an
     * IRIndex.
     *
     * @throws IRException
     *             If something goes wrong with the IR engine or with
     *             OBSearch.
     */
    void close() throws IRException;

    /**
     * Returns the number of documents stored in this index.
     *
     * @return the number of documents stored in this index.
     */
    int getSize();

    /**
     * Decides whether the given document should be skipped, based on whether
     * a document with the name {@code x.getName()} is present in the DB.
     * This method is intended to be used in validation mode only.
     * <p>
     * NOTE(review): the original documentation contradicted itself. Its
     * summary said the method returns true if a document with x's name
     * EXISTS in the DB, while its return clause said it returns true if the
     * DB does NOT contain such a document. Confirm the actual polarity
     * against the implementation before relying on it.
     * <p>
     * NOTE(review): the parameter is declared with the raw type
     * {@code Document} rather than {@code Document<O>}; this looks
     * unintentional but is part of the published signature.
     *
     * @param x
     *            The document whose name is looked up.
     * @return a flag telling whether the document should be skipped; see the
     *         review note above regarding its exact meaning.
     * @throws IOException
     *             Declared by the signature; presumably raised when the
     *             underlying storage cannot be read (to be confirmed).
     */
    boolean shouldSkipDoc(Document x) throws IOException;

    /**
     * The M-set score threshold is the minimum naive score for multi-sets
     * that the index will accept.
     *
     * @return the current M-set score threshold.
     */
    float getMSetScoreThreshold();

    /**
     * The M-set score threshold is the minimum naive score for multi-sets
     * that the index will accept.
     *
     * @param setScoreThreshold
     *            The new M-set score threshold.
     */
    void setMSetScoreThreshold(float setScoreThreshold);

    /**
     * The Set score threshold is the minimum naive score for sets that the
     * index will accept.
     *
     * @return the current Set score threshold.
     */
    float getSetScoreThreshold();

    /**
     * The Set score threshold is the minimum naive score for sets that the
     * index will accept.
     *
     * @param setScoreThreshold
     *            The new Set score threshold.
     */
    void setSetScoreThreshold(float setScoreThreshold);

    /**
     * Returns the number of different words that are used by the indexed
     * documents.
     *
     * @return the number of different words that are used by the indexed
     *         documents.
     * @throws DatabaseException
     *             Declared by the signature; presumably raised when the
     *             underlying database fails (to be confirmed).
     */
    int getWordsSize() throws DatabaseException;

    /**
     * Tells whether or not the index is in validation mode. In validation
     * mode we assume that documents with the same name are equal. This helps
     * us to add additional statistics on the performance of the scoring
     * technique.
     *
     * @return true if this index is in validation mode.
     */
    boolean isValidationMode();

    /**
     * Sets whether or not the index is in validation mode. In validation
     * mode we assume that documents with the same name are equal. This helps
     * us to add additional statistics on the performance of the scoring
     * technique.
     *
     * @param validationMode
     *            The new validation mode.
     */
    void setValidationMode(boolean validationMode);
}