Commit d5ed3331 authored by Panagiotis Papadakos's avatar Panagiotis Papadakos
Browse files

[Comments-Typos] Update all comments and also fix some typos. I think themis...

[Comments-Typos] Update all comments and also fix some typos. I think themis now can go public to hy463
parent 5898c76a
......@@ -30,6 +30,8 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
/**
* Class that holds all textual information read from an entry of the collection
* It is used by the S2JSONEntryReader
*
* @author Panagiotis Papadakos <papadako at ics.forth.gr>
*/
......
......@@ -24,10 +24,10 @@
*/
package gr.csd.uoc.hy463.themis.indexer;
import gr.csd.uoc.hy463.themis.indexer.model.DocInfoEssential;
import gr.csd.uoc.hy463.themis.indexer.model.DocInfoFull;
import gr.csd.uoc.hy463.themis.config.Config;
import gr.csd.uoc.hy463.themis.indexer.indexes.Index;
import gr.csd.uoc.hy463.themis.indexer.model.DocInfoEssential;
import gr.csd.uoc.hy463.themis.indexer.model.DocInfoFull;
import gr.csd.uoc.hy463.themis.utils.Pair;
import java.io.File;
import java.io.IOException;
......@@ -40,8 +40,14 @@ import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
/**
* Our basic indexer class responsible for indexing a collection and for holding
* all relevant information during querying
* Our basic indexer class. This class is responsible for two tasks:
*
* a) Create the appropriate indexes given a specific directory with files (in
* our case the Semantic Scholar collection)
*
* b) Given a path load the indexes (if they exist) and provide information
* about the indexed data, that can be used for implementing any kind of
* retrieval models
*
* When the indexes have been created we should have three files, as documented
* in Index.java
......@@ -52,7 +58,7 @@ public class Indexer {
private static final Logger __LOGGER__ = LogManager.getLogger(Indexer.class);
private Config __CONFIG__; // configuration options
// The path of index
// The file path of indexes
private String __INDEX_PATH__ = null;
// Filenames of indexes
private String __VOCABULARY_FILENAME__ = null;
......@@ -61,19 +67,19 @@ public class Indexer {
private String __META_FILENAME__ = null;
// Vocabulary should be stored in memory for querying! This is crucial
// since we want to keep things fast! This is done thouh load().
// since we want to keep things fast! This is done through load().
// For this project use a HashMap instead of a trie
private HashMap<String, Pair<Integer, Long>> __VOCABULARY__ = null;
private RandomAccessFile __POSTINGS__ = null;
private RandomAccessFile __DOCUMENTS__ = null;
// This map holds any information related with the indexed collection
// and should be serialized when finishing the index process. Such
// and should be serialized when the index process has finished. Such
// information could be the avgDL for the Okapi-BM25 implementation,
// a timestamp of when the indexing process finished, the path of the indexed
// collection, and whatever else you // might want. Before querying we have
// to load the serialized file
private Map<String, String> __META__ = null;
// collection, and whatever else you might want. But make sure that before
// querying the serialized file is loaded
private Map<String, String> __META_INDEX_INFO__ = null;
/**
* Default constructor. Creates also a config instance
......@@ -87,7 +93,7 @@ public class Indexer {
}
/**
* Constructor that gets a Config instance
* Constructor that gets a current Config instance
*
* @param config
* @throws IOException
......@@ -109,13 +115,13 @@ public class Indexer {
}
/**
* Is this a valid Index? Checks that the index path + all *.idx files exist
* Checks that the index path + all *.idx files exist
*
* Method that checks if we have all appropriate files
*
* @return
*/
public boolean isValid() {
public boolean hasIndex() {
// Check if path exists
File file = new File(__INDEX_PATH__);
if (!file.exists() || !file.isDirectory()) {
......@@ -148,7 +154,10 @@ public class Indexer {
* to the themis.config file then we have to dump all data read up to now to
* a partial index and continue with a new index. After creating all partial
* indexes then we have to merge them to create the final index that will be
* stored
* stored in the file path.
*
* Can also be modified to use the MAX_MEMORY usage parameter given in
* themis.config for brave hearts!
*
* @param path
* @return
......@@ -166,28 +175,29 @@ public class Indexer {
// for each scientific article in file
for (int article = 0;; article++) {
// Extract all textual info
// if indexed articles for this index less than config.getPartialIndexSize
// store all information to approapriate structures in memory to Index class
// else dump to files in appropriate directory id and increase partialIndexes
// if indexed articles for this index less than
// config.getPartialIndexSize store all information to
// appropriate structures in memory to Index class else dump
// to files in appropriate directory id and increase partialIndexes
if (article == __CONFIG__.getPartialIndexSize()) {
// Increase partial indexes and dump files to appropriate directory
partialIndexes++;
index.setID(partialIndexes);
index.dump(); // dump partial index
index.dump(); // dump partial index to appropriate subdirectory
}
}
}
// Now we have finished creating the partial indexes
// So we have to merge them
// So we have to merge them (call merge())
return false;
}
/**
* Method that merges two partial indexes and creates a new index with ID
* nextID, which is either a new partial index or the final index if we have
* finished merging
* finished merging (i.e., if nextID = 0)
*
* @param id
* @param id
......@@ -203,6 +213,7 @@ public class Indexer {
// the df and append the postings and documents of both
// Continue with the next lexicographically smallest word
// Dump the new index and delete the old partial indexes
// If nextID = 0 (i.e., we have finished merging partial indexes), store
// all idx files to INDEX_PATH
}
......@@ -211,6 +222,8 @@ public class Indexer {
* Method that indexes the collection that is given in the themis.config
* file
*
* Used for the task of indexing!
*
* @return
* @throws IOException
*/
......@@ -228,11 +241,13 @@ public class Indexer {
* Method responsible for loading vocabulary file to memory and also opening
* RAF files to postings and documents, ready to seek
*
* Used for the task of querying!
*
* @return
* @throws IOException
*/
public boolean load() throws IOException {
if (!isValid()) {
if (!hasIndex()) {
__LOGGER__.error("Index is not constructed correctly!");
return false;
}
......@@ -245,14 +260,39 @@ public class Indexer {
/**
* Basic method for querying functionality. Given the list of terms in the
* query, returns a List of Lists of DocInfoEssential objects, where each
* list of DocInfoEssential objects corresponds to a specific term of the
* query. A DocInfoEssential, should hold all needed information for
* implementing a retrieval model, like VSM, Okapi-BM25, etc.
* list of DocInfoEssential objects holds the DocInfoEssential representation
* of the docs that the corresponding term of the query appears in. A
* DocInfoEssential should hold all needed information for implementing a
* retrieval model, like VSM, Okapi-BM25, etc. This is more memory efficient
* than holding the DocInfoFull objects returned by getDocInfoFullTerms
*
* @param terms
* @return
*/
public List<List<DocInfoEssential>> getDocInfoEssentialForTerms(List<String> terms) {
// Guard: if loaded() reports that the indexes are not loaded, no term can
// be looked up, so null is returned
if (!loaded()) {
return null;
} else {
// Not implemented yet: this branch is a placeholder and currently also
// returns null, so callers cannot tell the two cases apart
return null;
}
}
/**
* Basic method for querying functionality. Given the list of terms in the
* query, returns a List of Lists of DocInfoFull objects, where each list of
* DocInfoFull objects holds the DocInfoFull representation of the docs that
* the corresponding term of the query appears in (i.e., the whole
* information). Not memory efficient though...
*
* Useful when we want to return the title, authors, etc.
*
* @param terms
* @return
*/
public List<List<DocInfoEssential>> getDocInfosForTerms(List<String> terms) {
public List<List<DocInfoFull>> getDocInfoFullTerms(List<String> terms) {
// If indexes are not loaded
if (!loaded()) {
return null;
......@@ -263,9 +303,12 @@ public class Indexer {
}
/**
* This is a methods that give a list of docs in essential representation,
* returns a list with the full description of docs stored in the Documents
* File
* This is a method that given a list of docs in the essential
* representation, returns a list with the full description of docs stored
* in the Documents File. This method is needed when we want to return the
* full information of a list of documents. Could be useful if we support
* pagination to the results (i.e. provide the full results of ten
* documents)
*
* @param docs
* @return
......
......@@ -31,7 +31,7 @@ import org.apache.logging.log4j.Logger;
/**
* This class holds all information related to a specific (partial or not) index
* in memory. It also knows how to store this information in files
* in memory. It also knows how to store this information to files
*
* @author Panagiotis Papadakos (papadako@ics.forth.gr)
*/
......@@ -85,30 +85,34 @@ public class Index {
* 1) VOCABULARY FILE => vocabulary.idx (Normal Sequential file)
*
* This is a normal sequential file where we write in lexicographic order
* the following entries separated with space: | TERM (a term of the
* the following entries separated by space: | TERM (a term of the
* vocabulary) | DF document frequency of this term | POINTER_TO_POSTING
* (the offset in the posting.idx) |
* (the offset in the posting.idx, this is a long number) |
*
* =========================================================================
* 2) POSTING FILE => posting.idx (Random Access File)
*
* For each entry it stores: |DOCUMENT_ID (40 ASCII chars - 40 bytes)| |TF
* (int 4 bytes) | POINTER_TO_DOCUMENT_FILE (long 4 bytes)
* For each entry it stores: | DOCUMENT_ID (40 ASCII chars => 40 bytes) | TF
* (int => 4 bytes) | POINTER_TO_DOCUMENT_FILE (long => 8 bytes)
*
* =========================================================================
* 3) DOCUMENTS FILE => documents.idx (Random Access File)
*
* For each entry it stores: | Title (variable bytes) | Author1,Author2,
* ...,Author_k (variable size) | Year (2 bytes short)| Journal Name
* (variable bytes) | The weight of Document (double - 8 bytes)| Length of
* Document (int - 4 bytes) | PageRank Score (double - 8 bytes => this will
* be used in the second phase of the project)
* ...,Author_k (variable size) | AuthorID1,AuthorID2, ...,AuthorID_k
* (variable size) | Year (2 bytes short)| Journal Name (variable bytes) |
* The weight (norm) of Document (double => 8 bytes)| Length of Document
* (int => 4 bytes) | PageRank Score (double => 8 bytes => this will be used
* in the second phase of the project)
*
*
* ==> IMPORTANT NOTES
*
* For strings that have a variable size, just add as an int (4 bytes)
* prefix storing the size in bytes of the string
* prefix storing the size in bytes of the string. Also make sure that you
* use the correct representation: ASCII (1 byte per character) or UTF-8
* (variable length, 1 to 4 bytes per character). For example the doc id is
* a hexadecimal hash, so there is no need for UTF-8 encoding
*
* Authors are separated by a comma
*
......@@ -118,7 +122,7 @@ public class Index {
* collection by scanning the whole postings list
*
* For now add 0.0 for PageRank score (a team will be responsible for
* computing it in the second phase of the project
* computing it in the second phase of the project)
*
*
* @return
......
......@@ -31,18 +31,18 @@ import java.util.Map;
* This class holds any information we might want to communicate with the
* retrieval model we are implementing about a specific document
*
* Currently just holds the important things.
* This essential representation just holds the important things.
*
* Can also be extended by another class, DocInfoFull, that will be used
* whenever we want to get all information related with a document, i.e. the
* entry of a document in the Documents file
* Can also be extended by another class, like the DocInfoFull, that will be
* used whenever we want to get all information related with a document, i.e.
* all information in the entry of a document in the Documents file
*
* Since probably we are going to store in memory a lot of these objects, we
* have to be as memory efficient as we can. This implementation with a map is
* worse than just keeping all properties as primitives and private members but
* seems to be simpler to interact with
*
* ID and offset are set only in the constructor
* ID and offset in document file are set only in the constructor
*
* @author Panagiotis Papadakos <papadako at ics.forth.gr>
*/
......@@ -59,6 +59,12 @@ public class DocInfoEssential {
// The size of the hashmap is only 3.... since up to now we have 3 properties to hold
protected final Map<PROPERTY, Object> props = new HashMap<>(3);
/**
*
* @param id the id of a document
* @param offset the offset in the document file that contains all
* information for this document
*/
public DocInfoEssential(String id, long offset) {
this.id = id;
this.offset = offset;
......@@ -76,7 +82,8 @@ public class DocInfoEssential {
}
/**
* Return the value of the property
* Return the value of the property. You have to cast the result to the
* appropriate type in your code!
*
* @param prop
* @return
......@@ -102,7 +109,7 @@ public class DocInfoEssential {
return false;
}
DocInfoEssential other = (DocInfoEssential) o;
return this.id == other.id;
return this.id.equals(other.id);
}
@Override
......
......@@ -26,7 +26,8 @@ package gr.csd.uoc.hy463.themis.indexer.model;
/**
* This class could be used when we want to get all information of a specific
* document, etc. title, authors, etc.
* document, e.g. title, authors, etc., by reading the appropriate entry in the
* documents file
*
* @author Panagiotis Papadakos <papadako at ics.forth.gr>
*/
......@@ -36,21 +37,4 @@ public class DocInfoFull extends DocInfoEssential {
super(id, offset);
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
DocInfoEssential other = (DocInfoEssential) o;
return this.id == other.id;
}
@Override
public int hashCode() {
return id.hashCode();
}
}
......@@ -25,7 +25,9 @@
package gr.csd.uoc.hy463.themis.retrieval;
/**
* A query term can have some kind of weight!
* This class represents a query term. A query term can have some kind of
* weight! Might be useful for experimenting with different weights for terms
* (e.g., synonyms/antonyms)
*
* @author Panagiotis Papadakos <papadako at ics.forth.gr>
*/
......
......@@ -47,21 +47,27 @@ abstract class ARetrievalModel {
}
/**
* Method that evaluates the query and returns the whole ranked list of
* results.
* Method that evaluates the query and returns the complete ranked list of
* relevant documents, as (result, score) pairs.
*
* If type PLAIN then the Object in the Pair is the id of the doc (String),
* if the type is ESSENTIAL the Object in the Pair is DocInfoEssential, and
* if the type is FULL then the Object in the Pair is DocInfoFull
* if the type is FULL then the Object in the Pair is DocInfoFull.
*
* The double is the score of the document as returned by the corresponding
* retrieval model.
*
* @param query
* The list must be in descending order according to the score
*
* @param query list of query terms
* @param type the type of object in the pair (PLAIN/ESSENTIAL/FULL)
* @return
*/
public abstract List<Pair<Object, Double>> getRankedResults(List<QueryTerm> query, RESULT_TYPE type);
/**
* Method that evaluates the query and returns the top-k ranked list of
* results.
* Method that evaluates the query and returns a list of pairs with the
* top-k ranked results.
*
* There are various policies to be faster when doing this if we do not want
* to compute the scores of all documents.
......@@ -75,13 +81,17 @@ abstract class ARetrievalModel {
* if the type is ESSENTIAL the Object in the Pair is DocInfoEssential, and
* if the type is FULL then the Object in the Pair is DocInfoFull
*
* The double is the score of the document as returned by the corresponding
* retrieval model.
*
* The list must be in descending order according to the score
*
* @param query
* @param type
* @param topk
* @param query list of query terms
* @param type the type of object in the pair (PLAIN/ESSENTIAL/FULL)
* @param topk the number of top-ranked results to return (e.g. 10 for the top-10 results)
* @return
*/
public abstract List<Pair<String, Double>> getRankedResults(List<QueryTerm> query, RESULT_TYPE type, int topk);
public abstract List<Pair<Object, Double>> getRankedResults(List<QueryTerm> query, RESULT_TYPE type, int topk);
// We should also add some kind of paging and caching... but maybe in the future
}
......@@ -46,7 +46,7 @@ public class OkapiBM25 extends ARetrievalModel {
}
@Override
public List<Pair<String, Double>> getRankedResults(List<QueryTerm> query, RESULT_TYPE type, int topk) {
public List<Pair<Object, Double>> getRankedResults(List<QueryTerm> query, RESULT_TYPE type, int topk) {
throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
}
......
......@@ -46,7 +46,7 @@ public class VSM extends ARetrievalModel {
}
@Override
public List<Pair<String, Double>> getRankedResults(List<QueryTerm> query, RESULT_TYPE type, int topk) {
public List<Pair<Object, Double>> getRankedResults(List<QueryTerm> query, RESULT_TYPE type, int topk) {
throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates.
}
......
......@@ -35,7 +35,8 @@ import gr.csd.uoc.hy463.themis.stemmer.english.EnglishStemmer;
import javax.swing.*;
/*
* Stemmer Class. A Singleton
* Stemmer Class. A Singleton class responsible for stemming tokens.
* This code is only for English!
*
*/
public class Stemmer {
......
......@@ -25,6 +25,8 @@
package gr.csd.uoc.hy463.themis.stemmer;
/**
* Class responsible for identifying stopwords
*
* @author Panagiotis Papadakos (papadako@ics.forth.gr)
*/
import java.io.BufferedReader;
......
......@@ -29,6 +29,8 @@
package gr.csd.uoc.hy463.themis.stemmer.english;
/**
* English Stemmer
*
* @author Panagiotis Papadakos (papadako@ics.forth.gr)
*/
public class EnglishStemmer {
......
......@@ -27,7 +27,7 @@ package gr.csd.uoc.hy463.themis.utils;
import java.io.Serializable;
/**
* FORTH-ICS
* Class that represents a Pair, i.e. a tuple of size two
*
* @param <L>
* @param <R>
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment