package org.kit.furia.index;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;
import org.ajmm.obsearch.Index;
import org.ajmm.obsearch.OB;
import org.ajmm.obsearch.Result;
import org.ajmm.obsearch.exception.AlreadyFrozenException;
import org.ajmm.obsearch.exception.IllegalIdException;
import org.ajmm.obsearch.exception.OBException;
import org.ajmm.obsearch.exception.OutOfRangeException;
import org.ajmm.obsearch.exception.UndefinedPivotsException;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DefaultSimilarity;
import org.apache.lucene.search.Hit;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Searcher;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.similar.MoreLikeThis;
import org.apache.lucene.search.similar.MoreLikeThisQuery;
import org.apache.lucene.search.similar.SimilarityQueries;
import org.kit.furia.Document;
import org.kit.furia.IRIndex;
import org.kit.furia.ResultCandidate;
import org.kit.furia.Document.DocumentElement;
import org.kit.furia.exceptions.IRException;
import com.sleepycat.bind.tuple.TupleInput;
import com.sleepycat.bind.tuple.TupleOutput;
import com.sleepycat.je.DatabaseException;
/*
Furia-chan: An Open Source software license violation detector.
Copyright (C) 2007 Kyushu Institute of Technology
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see .
*/
/**
* AbstractIRIndex holds the basic functionality for an Information Retrieval
* system that works on OB objects (please see www.obsearch.net). By using a
* distance function d, we transform the queries in terms of the closest
* elements that are in the database, and once this transformation is performed,
* we utilize an information retrieval system (Apache's Lucene) to perform the
* matching.
* @author Arnoldo Jose Muller Molina
* @param
* The basic unit in which all the information is divided. In the
* case of natural language documents, this would be a word.
* @since 0
*/
public abstract class AbstractIRIndex < O extends OB > implements IRIndex < O > {
/**
* Logger.
*/
private static final Logger logger = Logger
.getLogger(AbstractIRIndex.class);
/**
* Lucene has the concepts of fields of a document. They are analogous to
* columns of a DB table (SQL). This enumeration lists the name of the
* fields our IRIndex will use.
*/
protected enum FieldName {
/*
* The words of the document (data to be stored but used by Lucene to
* actually compute the score)
*/
WORDS,
/* The name of the document */
DOC_NAME,
/* Cardinality of the multi-set of OB objects in a document */
MULTI_SET_SIZE,
/* Details of the doc word-freq */
DOC_DETAILS,
/* Cardinality of the set of OB objects in a document */
SET_SIZE,
}
/**
* This object is used to add elements to the index.
*/
protected IndexWriter indexWriter;
/**
* This object is used to read different data from the index.
*/
protected IndexReader indexReader;
/**
* This object is used to search the index;
*/
protected Searcher searcher;
/**
* At least the given naive mset score must be obtained to consider a term in the result.
*/
protected float mSetScoreThreshold = 0.32f;
/**
* At least the given naive set score must be obtained to consider a term in the result.
*/
protected float setScoreThreshold = 0.04f;
/**
* Tells whether or not the index is in validation mode.
*/
protected boolean validationMode = false;
/**
* The index where all the data will be stored.
*/
/**
* Creates a new IR index if none is available in the given path.
* @param dbFolder
* The folder in which Lucene's files will be stored
* @throws IOException
* If the given directory does not exist or if some other IO
* error occurs
*/
public AbstractIRIndex(File dbFolder) throws IOException {
indexWriter = new IndexWriter(dbFolder, new WhitespaceAnalyzer());
indexReader = IndexReader.open(dbFolder);
searcher = new IndexSearcher(indexReader);
}
public int delete(String documentName) throws IRException {
int res = 0;
try {
res = indexReader.deleteDocuments(new Term(FieldName.DOC_NAME
.toString(), documentName));
} catch (Exception e) {
throw new IRException(e);
}
return res;
}
/**
* Returns true if the document corresponding
* to x's name exists in the DB. This method is intended to be
* used in validation mode only.
* @param x
* @return true if the DB does not contain a document with name x.getName()
*/
public boolean shouldSkipDoc(Document x) throws IOException{
//TermDocs td = this.indexReader.termDocs(new Term(FieldName.DOC_NAME.toString(), x.getName()));
//org.apache.lucene.document.Document document = searcher.doc(td.doc());
return this.indexReader.docFreq(new Term(FieldName.DOC_NAME.toString(), x.getName()) )< 1;
}
/**
* Calculates the ResultCandidate between a normalized query and a Lucene document.
* @return A result candidate for the given document and normalized query.
*/
protected ResultCandidate calculateSimilarity( org.apache.lucene.document.Document document , Map < Integer, Integer > normalizedQuery, float score){
String docName = document.getField(
FieldName.DOC_NAME.toString()).stringValue();
TupleInput in = new TupleInput(document.getField(
FieldName.MULTI_SET_SIZE.toString()).binaryValue());
// size of the doc in the DB.
int docSizeMSet = in.readInt();
int docSizeSet = new TupleInput(document.getField(
FieldName.SET_SIZE.toString()).binaryValue())
.readInt();
int cx = 0;
int intersectionQueryMSet = 0;
int intersectionQuerySet = 0;
TupleInput freqs = new TupleInput(document.getField(
FieldName.DOC_DETAILS.toString()).binaryValue());
while (cx < docSizeSet) {
int wordId = freqs.readInt();
int frecuency = freqs.readInt();
Integer frequencyQuery = normalizedQuery.get(wordId);
if (frequencyQuery != null) {
intersectionQuerySet++;
intersectionQueryMSet += Math.min(frecuency,
frequencyQuery);
}
cx++;
}
ResultCandidate can = new ResultCandidate(docName, score,
intersectionQueryMSet, docSizeMSet,
intersectionQuerySet, docSizeSet);
return can;
}
private class FHitCollector extends HitCollector{
PriorityQueue < ResultCandidate > candidates;
Map < Integer, Integer > normalizedQuery;
private boolean found = false; /* to be used only in validation mode */
private String queryName; /* name of the query, only used in validation mode */
private ResultCandidate match; /* the document, only used in validation mode */
public FHitCollector(Map < Integer, Integer > normalizedQuery, String queryName){
this.candidates = new PriorityQueue < ResultCandidate >();
this.normalizedQuery = normalizedQuery;
this.queryName = queryName;
}
public ResultCandidate getMatch() {
return match;
}
public boolean isFound() {
return found;
}
public void collect(int doc, float score) {
org.apache.lucene.document.Document document = null;
try{
document = searcher.doc(doc);
}catch(IOException e){
logger.fatal("This is bad", e);
candidates =null;
normalizedQuery = null;
assert false;
}
ResultCandidate can = calculateSimilarity(document, normalizedQuery, score);
if(validationMode){
String docName = document.getField(
FieldName.DOC_NAME.toString()).stringValue();
if(docName.equals(queryName)){
found = true;
logger.debug("Collector: Found :)" + can.toString());
}
/* if(! shouldAddToTheFinalResult(can)){
candidates.add(can); // only in validation mode, we add the result so that we can analyze everything
}*/
}
if(shouldAddToTheFinalResult(can)){
candidates.add(can);
}
}
private boolean shouldAddToTheFinalResult(ResultCandidate can){
return can.getNaiveScoreMSet() >= mSetScoreThreshold && can.getNaiveScoreSet() >= setScoreThreshold;
}
public List getResults(){
List res = new LinkedList();
ResultCandidate cur;
int total = 0;
while (((cur = candidates.poll()) != null)) {
res.add(cur);
}
return res;
}
}
/**
* Returns the # of documents in this DB.
* @return
*/
public int getSize(){
return this.indexReader.numDocs();
}
protected List < ResultCandidate > processQueryResults(
Map < Integer, Integer > normalizedQuery, short n, Document query)
throws IRException {
List < ResultCandidate > res = new LinkedList < ResultCandidate >();
try {
// at this stage we have created a map that holds all the matched
// elements,
// the initial document that is in terms of the original objects
// (obfuscated fragments in the case of furia)
// is now transformed in terms of the database.
// we now generate a priority queue with the terms, in order to sort
// them and put the most
// relevant at the beginning of the query.
if (normalizedQuery.size() > 0) {
PriorityQueue < Word > sorted = createPriorityQueue(normalizedQuery);
Query q = createQuery(sorted);
FHitCollector collector = new FHitCollector(normalizedQuery, query.getName());
// now we just perform the search and return the results.
searcher.search(q,collector);
/*if(validationMode && ! collector.isFound()){
// we did not found the doc. Print the stats of the doc so we can understand what is going on.
TermDocs td = this.indexReader.termDocs(new Term(FieldName.DOC_NAME.toString(), query.getName()));
org.apache.lucene.document.Document document = null;
try{
document = searcher.doc(td.doc());
}catch(IOException e){
logger.fatal("This is bad", e);
throw new IRException(e);
}
logger.info("Forced: " + calculateSimilarity(document, normalizedQuery, -1.0f).toString());
}*/
return(collector.getResults());
}
}catch (IOException e) {
throw new IRException(e);
}
return res;
}
/**
* Receives a map with normalized OB_id -> freq and returns the closest
* elements to the original query
* @param normalizedQuery
* A map that contains OB_id -> freq
* @param n
* Return the top n results only.
* @return The top n candidates.
*/
/*protected List < ResultCandidate > processQueryResults(
Map < Integer, Integer > normalizedQuery, short n)
throws IRException {
try {
// at this stage we have created a map that holds all the matched
// elements,
// the initial document that is in terms of the original objects
// (obfuscated fragments in the case of furia)
// is now transformed in terms of the database.
// we now generate a priority queue with the terms, in order to sort
// them and put the most
// relevant at the beginning of the query.
List < ResultCandidate > res = new LinkedList < ResultCandidate >();
if (normalizedQuery.size() > 0) {
PriorityQueue < Word > sorted = createPriorityQueue(normalizedQuery);
Query q = createQuery(sorted);
// now we just perform the search and return the results.
Hits hits = searcher.search(q);
if(logger.isDebugEnabled()){
logger.info("Hits size: " + hits.length());
}
Iterator < Hit > it = hits.iterator();
int i = 0;
while (it.hasNext() && i < n) {
Hit h = it.next();
String docName = h.getDocument().getField(
FieldName.DOC_NAME.toString()).stringValue();
TupleInput in = new TupleInput(h.getDocument().getField(
FieldName.MULTI_SET_SIZE.toString()).binaryValue());
// size of the doc in the DB.
int docSizeMSet = in.readInt();
int docSizeSet = new TupleInput(h.getDocument().getField(
FieldName.SET_SIZE.toString()).binaryValue())
.readInt();
int cx = 0;
int intersectionQueryMSet = 0;
int intersectionQuerySet = 0;
TupleInput freqs = new TupleInput(h.getDocument().getField(
FieldName.DOC_DETAILS.toString()).binaryValue());
while (cx < docSizeSet) {
int wordId = freqs.readInt();
int frecuency = freqs.readInt();
Integer frequencyQuery = normalizedQuery.get(wordId);
if (frequencyQuery != null) {
intersectionQuerySet++;
intersectionQueryMSet += Math.min(frecuency,
frequencyQuery);
}
cx++;
}
res.add(new ResultCandidate(docName, h.getScore(),
intersectionQueryMSet, docSizeMSet,
intersectionQuerySet, docSizeSet));
i++;
}
}
return res;
} catch (IOException e) {
throw new IRException(e);
}
}*/
/**
* Repeats times times the given string, separates everything with a space
* @param x
* @param times
* @return
*/
private StringBuilder repeat(int x, int times) {
int i = 0;
assert times > 0;
StringBuilder res = new StringBuilder();
while (i < times) {
res.append(Integer.toString(x));
res.append(" ");
i++;
}
return res;
}
public void insert(Document < O > document) throws IRException {
// OBSearch index that we use for "stemming".
assert document.size() != 0;
try {
Index < O > index = getIndex();
Iterator < Document < O >.DocumentElement < O > > it = document
.iterator();
StringBuilder contents = new StringBuilder();
int i = 0;
long prevTime = System.currentTimeMillis();
// write the multi-set representation of the document into lucene.
TupleOutput mSetOut = new TupleOutput();
while (it.hasNext()) {
Document < O >.DocumentElement < O > elem = it.next();
Result res = index.insert(elem.getObject());
assert res.getStatus() == Result.Status.OK
|| res.getStatus() == Result.Status.EXISTS;
contents.append(repeat(res.getId(), elem.getCount()));
mSetOut.writeInt(res.getId());
mSetOut.writeInt(elem.getCount());
i++;
}
// now we just have to create the fields
Field contentsField = new Field(FieldName.WORDS.toString(),
contents.toString(), Field.Store.NO, Field.Index.TOKENIZED);
Field docName = new Field(FieldName.DOC_NAME.toString(), document
.getName(), Field.Store.YES, Field.Index.UN_TOKENIZED);
TupleOutput out = new TupleOutput();
out.writeInt(document.multiSetSize());
Field multiSetSize = new Field(FieldName.MULTI_SET_SIZE.toString(),
out.getBufferBytes(), Field.Store.YES);
Field docDetails = new Field(FieldName.DOC_DETAILS.toString(),
mSetOut.getBufferBytes(), Field.Store.YES);
TupleOutput out2 = new TupleOutput();
out2.writeInt(document.size());
Field setSize = new Field(FieldName.SET_SIZE.toString(), out2
.getBufferBytes(), Field.Store.YES);
org.apache.lucene.document.Document doc = new org.apache.lucene.document.Document();
doc.add(contentsField);
doc.add(docName);
doc.add(multiSetSize);
doc.add(setSize);
doc.add(docDetails);
indexWriter.addDocument(doc);
} catch (Exception e) {
throw new IRException(e);
}
}
public void freeze() throws IRException {
try {
logger.debug("Freezing index, OB objects: "
+ this.getIndex().databaseSize());
getIndex().freeze();
logger.debug("Optimizing Lucene");
indexWriter.optimize();
} catch (Exception e) {
logger.fatal("Error in freeze: ", e);
throw new IRException(e);
}
}
public void close() throws IRException {
try {
getIndex().close();
indexWriter.close();
indexReader.close();
} catch (Exception e) {
throw new IRException(e);
}
}
/*
private Query createQuery(PriorityQueue < Word > q) {
StringBuilder contents = new StringBuilder();
Word cur;
int total = 0;
while (((cur = q.poll()) != null)) {
total += cur.getDocFreq();
contents.append(repeat(cur.getId(), cur.getDocFreq()));
//contents.append(cur.getId());
//contents.append(" ");
}
MoreLikeThis yay = new MoreLikeThis(indexReader);
yay.setAnalyzer(new WhitespaceAnalyzer());
yay.setBoost(true);
yay.setMaxNumTokensParsed(total);
yay.setMaxQueryTerms(0);
yay.setFieldNames(new String[] { FieldName.WORDS.toString() });
Query res = null;
try{
res = yay.like(new ByteArrayInputStream(contents.toString().getBytes()));
}catch(IOException e){
logger.fatal("Error while creating query", e);
}
return res;
//MoreLikeThisQuery res = new MoreLikeThisQuery(contents.toString(), new String[]{ FieldName.WORDS.toString() }, new WhitespaceAnalyzer() );
//res.setMaxQueryTerms(100);
//res.setPercentTermsToMatch(0.3f);
//return res;
//Query res = null;
//try{
// res = SimilarityQueries.formSimilarQuery(contents.toString(), new WhitespaceAnalyzer(), FieldName.WORDS.toString(), null);
//}catch(IOException e){
// logger.fatal("Problem while creating Query", e);
//}
//return res;
}*/
/**
* Create the "more like" query from a PriorityQueue. (This code was
* borrowed from lucene-contrib)
*/
private Query createQuery(PriorityQueue < Word > q) {
BooleanQuery query = new BooleanQuery(true); // aqui va el true, da
// mejores resultados.
// query.setUseScorer14(true); // no effect
query.setAllowDocsOutOfOrder(true);
//query.setMinimumNumberShouldMatch(100);
BooleanQuery.setMaxClauseCount(q.size());
boolean boost = true;
Word cur;
int qterms = 0;
// float bestScore = 0;
float bestScore = 0;
//while (((cur = q.poll()) != null) && qterms < 100) {
while (((cur = q.poll()) != null)) {
TermQuery tq = new TermQuery(new Term(FieldName.WORDS.toString(),
cur.getId().toString()));
//if (boost) { if (qterms == 0) { bestScore = cur.getScore(); }
//float myScore = cur.getScore(); tq.setBoost(myScore / bestScore); }
//tq.setBoost( (float)cur.getDocFreq()/ (float)q.size());
query.add(tq, BooleanClause.Occur.SHOULD);
qterms++;
}
return query;
}
/**
* Create a PriorityQueue from a word->tf map. (This code was borrowed from
* lucene-contrib)
* @param words
* a map of words keyed on the word(String) with Int objects
* as the values.
* @return A priority queue ordered by the most important word.
*/
protected PriorityQueue < Word > createPriorityQueue(
Map < Integer, Integer > words) throws IOException {
// have collected all words in doc and their freqs
assert words.size() > 0;
int numDocs = this.indexReader.numDocs();
PriorityQueue < Word > res = new PriorityQueue < Word >(words.size());
Similarity similarity = new DefaultSimilarity();
Iterator < Integer > it = words.keySet().iterator();
while (it.hasNext()) { // for every word
Integer wordId = it.next();
Integer repetition = words.get(wordId);
int tf = repetition; // term freq in the source doc
int docFreq = indexReader.docFreq(new Term(FieldName.WORDS
.toString(), wordId.toString()));
if (docFreq == 0) {
continue; // index update problem?
}
float idf = similarity.idf(docFreq, numDocs);
float score = tf * idf;
// only really need 1st 3 entries, other ones are for
// troubleshooting
res.add(new Word(wordId, score, idf, docFreq, tf));
}
return res;
}
/**
* Represents an OB object.
*/
protected class Word implements Comparable < Word > {
/**
* Id of the word.
*/
private Integer id;
/**
* Overall score.
*/
private float score;
/**
* idf.
*/
private float idf;
/**
* # of docs where this word appears.
*/
private int docFreq;
/**
* tf.
*/
private int tf;
public Word(Integer id, float score, float idf, int docFreq, int tf) {
super();
this.id = id;
this.score = score;
this.idf = idf;
this.docFreq = docFreq;
this.tf = tf;
}
public Integer getId() {
return id;
}
public float getScore() {
return score;
}
public float getIdf() {
return idf;
}
public int getDocFreq() {
return docFreq;
}
public int getTf() {
return tf;
}
public int compareTo(Word w) {
int res = 0;
if (score < w.score) {
res = -1;
} else if (score > w.score) {
res = 1;
}// else they are equal
return res * -1;// invert the result
}
}
public float getMSetScoreThreshold() {
return mSetScoreThreshold;
}
public void setMSetScoreThreshold(float setScoreThreshold) {
mSetScoreThreshold = setScoreThreshold;
}
public float getSetScoreThreshold() {
return setScoreThreshold;
}
public void setSetScoreThreshold(float setScoreThreshold) {
this.setScoreThreshold = setScoreThreshold;
}
public boolean isValidationMode() {
return validationMode;
}
public void setValidationMode(boolean validationMode) {
this.validationMode = validationMode;
}
}