Commit 7265d7a0 authored by Panagiotis Papadakos
Browse files

[Index] I think I have finished with the basic structure of the index. I have...

[Index] I think I have finished with the basic structure of the index. I have to check again the comments so that I get no complaints from the students :)
parent 604115ca
/*
* themis - A fair search engine for scientific articles
*
* Currently over the Semantic Scholar Open Research Corpus
* http://s2-public-api-prod.us-west-2.elasticbeanstalk.com/corpus/
*
* Collaborative work with the undergraduate/graduate students of
* Information Retrieval Systems (hy463) course
* Spring Semester 2020
*
* -- Writing code during COVID-19 pandemic times :-( --
*
* Aiming to participate in TREC 2020 Fair Ranking Track
* https://fair-trec.github.io/
*
* Computer Science Department http://www.csd.uoc.gr
* University of Crete
* Greece
*
* LICENCE: TO BE ADDED
*
* Copyright 2020
*
*/
package gr.csd.uoc.hy463.themis.indexer;
import gr.csd.uoc.hy463.themis.indexer.model.DocInfoEssential;
import gr.csd.uoc.hy463.themis.indexer.model.DocInfoFull;
import gr.csd.uoc.hy463.themis.config.Config;
import gr.csd.uoc.hy463.themis.indexer.indexes.Index;
import gr.csd.uoc.hy463.themis.utils.Pair;
import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
/**
* Our basic indexer class responsible for indexing a collection and for holding
* all relevant information during querying
*
* When the indexes have been created we should have three files, as documented
* in Index.java
*
* @author Panagiotis Papadakos (papadako@ics.forth.gr)
*/
public class Indexer {

    private static final Logger __LOGGER__ = LogManager.getLogger(Indexer.class);
    private Config __CONFIG__;  // configuration options
    // The path of index
    private String __INDEX_PATH__ = null;
    // Filenames of indexes
    private String __VOCABULARY_FILENAME__ = null;
    private String __POSTINGS_FILENAME__ = null;
    private String __DOCUMENTS_FILENAME__ = null;
    // NOTE(review): never assigned anywhere in this class (init() does not set it)
    private String __META_FILENAME__ = null;

    // Vocabulary should be stored in memory for querying! This is crucial
    // since we want to keep things fast! This is done through load().
    // For this project use a HashMap instead of a trie
    private HashMap<String, Pair<Integer, Long>> __VOCABULARY__ = null;
    private RandomAccessFile __POSTINGS__ = null;
    private RandomAccessFile __DOCUMENTS__ = null;

    // This map holds any information related with the indexed collection
    // and should be serialized when finishing the index process. Such
    // information could be the avgDL for the Okapi-BM25 implementation,
    // a timestamp of when the indexing process finished, the path of the indexed
    // collection, and whatever else you might want. Before querying we have
    // to load the serialized file
    private Map<String, String> __META__ = null;

    /**
     * Default constructor. Creates also a config instance
     *
     * @throws IOException
     * @throws ClassNotFoundException
     */
    public Indexer() throws IOException, ClassNotFoundException {
        __CONFIG__ = new Config();  // reads info from themis.config file
        init();
    }

    /**
     * Constructor that gets a Config instance
     *
     * @param config the configuration to read the index path and filenames from
     * @throws IOException
     * @throws ClassNotFoundException
     */
    public Indexer(Config config) throws IOException, ClassNotFoundException {
        this.__CONFIG__ = config;  // use the caller-supplied configuration as is
        init();
    }

    /**
     * Initialize things: caches the index path and the index filenames from
     * the configuration
     */
    private void init() {
        __VOCABULARY_FILENAME__ = __CONFIG__.getVocabularyFileName();
        __POSTINGS_FILENAME__ = __CONFIG__.getPostingsFileName();
        __DOCUMENTS_FILENAME__ = __CONFIG__.getDocumentsFileName();
        __INDEX_PATH__ = __CONFIG__.getIndexPath();
    }

    /**
     * Is this a valid Index? Checks that the index path + all *.idx files exist
     *
     * Method that checks if we have all appropriate files. The first missing
     * item is logged as an error and the check stops there.
     *
     * @return true if the index directory exists and contains the vocabulary,
     * postings and documents files, false otherwise
     */
    public boolean isValid() {
        // A missing path is an invalid index, not a crash
        if (__INDEX_PATH__ == null) {
            __LOGGER__.error("Index path is not set!");
            return false;
        }
        // Check if path exists
        File indexDir = new File(__INDEX_PATH__);
        if (!indexDir.exists() || !indexDir.isDirectory()) {
            __LOGGER__.error(__INDEX_PATH__ + " directory does not exist!");
            return false;
        }
        // Check if index files exist (stops at the first missing one)
        return indexFileExists(indexDir, __VOCABULARY_FILENAME__, "vocabulary file")
                && indexFileExists(indexDir, __POSTINGS_FILENAME__, "posting binary file")
                && indexFileExists(indexDir, __DOCUMENTS_FILENAME__, "documents binary file");
    }

    /**
     * Checks that a single index file exists inside the index directory and is
     * not itself a directory. Logs an error when the check fails.
     *
     * @param indexDir the index directory
     * @param filename the name of the file, relative to the index directory
     * @param description human readable description used in the error message
     * @return true if the file exists and is not a directory
     */
    private boolean indexFileExists(File indexDir, String filename, String description) {
        if (filename == null) {
            __LOGGER__.error("Name of " + description + " is not set!");
            return false;
        }
        // Resolve against the directory so that it does not matter whether the
        // configured index path ends with a file separator or not
        File file = new File(indexDir, filename);
        if (!file.exists() || file.isDirectory()) {
            __LOGGER__.error(filename + " " + description + " does not exist in " + __INDEX_PATH__);
            return false;
        }
        return true;
    }

    /**
     * Method responsible for indexing a directory of files
     *
     * If the number of files is larger than the PARTIAL_INDEX_MAX_DOCS_SIZE set
     * to the themis.config file then we have to dump all data read up to now to
     * a partial index and continue with a new index. After creating all partial
     * indexes then we have to merge them to create the final index that will be
     * stored
     *
     * This is still a skeleton: the list of files is never populated, so the
     * loops below do not execute and the method always returns false.
     *
     * @param path the directory of the collection (currently not used)
     * @return currently always false
     * @throws IOException
     */
    public boolean index(String path) throws IOException {
        Index index = new Index(__CONFIG__);
        int partialIndexes = 0;

        // Holds all files in path
        // NOTE(review): nothing fills this list from 'path' yet
        List<String> files = new ArrayList<>();

        // for each file in path
        for (String file : files) {
            // for each scientific article in file
            // NOTE(review): this loop has no exit condition; it has to stop
            // when the articles of the file are exhausted once the reading
            // code is added
            for (int article = 0;; article++) {
                // Extract all textual info
                // if indexed articles for this index less than config.getPartialIndexSize
                // store all information to appropriate structures in memory to Index class
                // else dump to files in appropriate directory id and increase partialIndexes

                if (article == __CONFIG__.getPartialIndexSize()) {
                    // Increase partial indexes and dump files to appropriate directory
                    partialIndexes++;
                    index.setID(partialIndexes);
                    index.dump();   // dump partial index
                }
            }
        }

        // Now we have finished creating the partial indexes
        // So we have to merge them
        return false;
    }

    /**
     * Method that merges two partial indexes and creates a new index with ID
     * nextID, which is either a new partial index or the final index if we have
     * finished merging
     *
     * Not implemented yet; the body only outlines the intended steps.
     *
     * @param partialID1 the id of the first partial index
     * @param partialID2 the id of the second partial index
     * @param nextID NOTE(review): declared as boolean although the description
     * treats it as an index id (0 meaning the final index) - confirm the
     * intended type before implementing
     */
    private void merge(int partialID1, int partialID2, boolean nextID) {
        // Read vocabulary files line by line in corresponding dirs
        // and check which is the shortest lexicographically.
        // Read the corresponding entries in the postings and documents file
        // and append accordingly the new ones
        // If both partial indexes contain the same word, then we have to update
        // the df and append the postings and documents of both
        // Continue with the next lexicographically shortest word
        // Dump the new index and delete the old partial indexes

        // If nextID = 0 (i.e., we have finished merging partial indexes), store
        // all idx files to INDEX_PATH
    }

    /**
     * Method that indexes the collection that is given in the themis.config
     * file
     *
     * @return the result of index(String) for the configured dataset path, or
     * false if no dataset path is configured
     * @throws IOException
     */
    public boolean index() throws IOException {
        String collectionPath = __CONFIG__.getDatasetPath();
        if (collectionPath != null) {
            return index(collectionPath);
        } else {
            __LOGGER__.error("DATASET_PATH not set in themis.config!");
            return false;
        }
    }

    /**
     * Method responsible for loading vocabulary file to memory and also opening
     * RAF files to postings and documents, ready to seek
     *
     * Only the validity check is implemented so far.
     *
     * @return currently always false
     * @throws IOException
     */
    public boolean load() throws IOException {
        if (!isValid()) {
            __LOGGER__.error("Index is not constructed correctly!");
            return false;
        }

        // Else load vocabulary file in memory in a HashMap and open
        // indexes postings and documents RAF files
        return false;
    }

    /**
     * Basic method for querying functionality. Given the list of terms in the
     * query, returns a List of Lists of DocInfoEssential objects, where each
     * list of DocInfoEssential objects corresponds to a specific term of the
     * query. A DocInfoEssential, should hold all needed information for
     * implementing a retrieval model, like VSM, Okapi-BM25, etc.
     *
     * @param terms the terms of the query
     * @return null if the indexes are not loaded; also null for now when they
     * are, since the lookup is not implemented yet
     */
    public List<List<DocInfoEssential>> getDocInfosForTerms(List<String> terms) {
        // If indexes are not loaded
        if (!loaded()) {
            return null;
        } else {
            // to implement
            return null;
        }
    }

    /**
     * This is a method that given a list of docs in essential representation,
     * returns a list with the full description of docs stored in the Documents
     * File
     *
     * @param docs the documents in essential representation
     * @return null if the indexes are not loaded; also null for now when they
     * are, since the lookup is not implemented yet
     */
    public List<DocInfoFull> getDocDescription(List<DocInfoEssential> docs) {
        // If indexes are not loaded
        if (!loaded()) {
            return null;
        } else {
            // to implement
            return null;
        }
    }

    /**
     * Method that checks if indexes have been loaded/opened
     *
     * @return true if the vocabulary map and both random access files are set
     */
    public boolean loaded() {
        return __VOCABULARY__ != null && __POSTINGS__ != null
                && __DOCUMENTS__ != null;
    }

    /**
     * Get the path of index as set in themis.config file
     *
     * @return the index path, or the empty string if there is no configuration
     */
    public String getIndexDirectory() {
        if (__CONFIG__ != null) {
            return __INDEX_PATH__;
        } else {
            __LOGGER__.error("Index has not been initialized correctly");
            return "";
        }
    }
}
/*
* themis - A fair search engine for scientific articles
*
* Currently over the Semantic Scholar Open Research Corpus
* http://s2-public-api-prod.us-west-2.elasticbeanstalk.com/corpus/
*
* Collaborative work with the undergraduate/graduate students of
* Information Retrieval Systems (hy463) course
* Spring Semester 2020
*
* -- Writing code during COVID-19 pandemic times :-( --
*
* Aiming to participate in TREC 2020 Fair Ranking Track
* https://fair-trec.github.io/
*
* Computer Science Department http://www.csd.uoc.gr
* University of Crete
* Greece
*
* LICENCE: TO BE ADDED
*
* Copyright 2020
*
*/
package gr.csd.uoc.hy463.themis.indexer.indexes;
import gr.csd.uoc.hy463.themis.config.Config;
import java.util.TreeMap;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
/**
* This class holds all information related to a specific (partial or not) index
* in memory. It also knows how to store this information in files
*
* @author Panagiotis Papadakos (papadako@ics.forth.gr)
*/
public class Index {

    private static final Logger __LOGGER__ = LogManager.getLogger(Index.class);

    // The id of the index, used for partial indexes. Partial indexes have an
    // id > 0 and their idx files are stored in INDEX_PATH/id (e.g. the first
    // partial index is saved to INDEX_PATH/1/). The full index keeps id 0 and
    // its idx files are stored directly in INDEX_PATH
    private int id = 0;

    private Config __CONFIG__;  // configuration options

    // The path of index
    private String __INDEX_PATH__ = null;

    // Filenames of indexes
    private String __VOCABULARY_FILENAME__ = null;
    private String __POSTINGS_FILENAME__ = null;
    private String __DOCUMENTS_FILENAME__ = null;

    // In-memory information about the vocabulary, posting and document files.
    // A TreeMap keeps its entries sorted, which helps when writing the
    // vocabulary file
    private TreeMap<String, Integer> __VOCABULARY__ = null;

    // We have to hold also other appropriate data structures for postings / documents

    /**
     * Creates an index that takes its path and filenames from the given
     * configuration
     *
     * @param config the configuration options
     */
    public Index(Config config) {
        this.__CONFIG__ = config;
        // Cache the configured filenames and the index path
        this.__VOCABULARY_FILENAME__ = this.__CONFIG__.getVocabularyFileName();
        this.__POSTINGS_FILENAME__ = this.__CONFIG__.getPostingsFileName();
        this.__DOCUMENTS_FILENAME__ = this.__CONFIG__.getDocumentsFileName();
        this.__INDEX_PATH__ = this.__CONFIG__.getIndexPath();
    }

    /**
     * Dumps all information held by this index to the filesystem: to the
     * directory INDEX_PATH/id for a partial index, or to INDEX_PATH itself
     * when id = 0. Not implemented yet; the intended file layout is:
     *
     * =========================================================================
     * 1) VOCABULARY FILE => vocabulary.idx (Normal Sequential file)
     *
     * One entry per term, written in lexicographic order, with the fields
     * separated by a space:
     * | TERM (a term of the vocabulary)
     * | DF (document frequency of this term)
     * | POINTER_TO_POSTING (the offset in the posting.idx) |
     *
     * =========================================================================
     * 2) POSTING FILE => posting.idx (Random Access File)
     *
     * For each entry it stores:
     * | DOCUMENT_ID (40 ASCII chars - 40 bytes)
     * | TF (int - 4 bytes)
     * | POINTER_TO_DOCUMENT_FILE (long - 8 bytes; an earlier version of this
     * note said 4, but a Java long occupies 8 bytes) |
     *
     * =========================================================================
     * 3) DOCUMENTS FILE => documents.idx (Random Access File)
     *
     * For each entry it stores:
     * | Title (variable bytes)
     * | Author1,Author2,...,Author_k (variable size)
     * | Year (short - 2 bytes)
     * | Journal Name (variable bytes)
     * | The weight of Document (double - 8 bytes)
     * | Length of Document (int - 4 bytes)
     * | PageRank Score (double - 8 bytes => this will be used in the second
     * phase of the project) |
     *
     * ==> IMPORTANT NOTES
     *
     * For strings that have a variable size, just add as a prefix an int
     * (4 bytes) storing the size in bytes of the string
     *
     * Authors are separated by a comma
     *
     * Author ids are also separated with a comma
     *
     * The weight of the document will be computed after indexing the whole
     * collection by scanning the whole postings list
     *
     * For now add 0.0 for PageRank score (a team will be responsible for
     * computing it in the second phase of the project)
     *
     * @return currently always false
     */
    public boolean dump() {
        if (id != 0) {
            // partial index: dump to INDEX_PATH/id
        } else {
            // full index: dump to INDEX_PATH
        }
        return false;
    }

    /**
     * Sets the id of this index
     *
     * @param id the new id; 0 denotes the full index
     */
    public void setID(int id) {
        this.id = id;
    }

    /**
     * Returns if index is partial
     *
     * @return true if the id is anything other than 0
     */
    public boolean isPartial() {
        final boolean fullIndex = (id == 0);
        return !fullIndex;
    }
}
/*
* themis - A fair search engine for scientific articles
*
* Currently over the Semantic Scholar Open Research Corpus
* http://s2-public-api-prod.us-west-2.elasticbeanstalk.com/corpus/
*
* Collaborative work with the undergraduate/graduate students of
* Information Retrieval Systems (hy463) course
* Spring Semester 2020
*
* -- Writing code during COVID-19 pandemic times :-( --
*
* Aiming to participate in TREC 2020 Fair Ranking Track
* https://fair-trec.github.io/
*
* Computer Science Department http://www.csd.uoc.gr
* University of Crete
* Greece
*
* LICENCE: TO BE ADDED
*
* Copyright 2020
*
*/
package gr.csd.uoc.hy463.themis.indexer.model;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
/**
* This class holds any information we might want to communicate with the
* retrieval model we are implementing about a specific document
*
* Currently just holds the important things.
*
* Can also be extended by another class, DocInfoFull, that will be used
* whenever we want to get all information related with a document, i.e. the
* entry of a document in the Documents file
*
* Since we are probably going to store a lot of these objects in memory, we
* have to be as memory efficient as we can. This implementation with a map is
* worse than just keeping all properties as primitives and private members but
* seems to be simpler to interact with
*
* ID and offset are set only in the constructor
*
* @author Panagiotis Papadakos <papadako at ics.forth.gr>
*/
public class DocInfoEssential {
public enum PROPERTY {
PAGERANK, // pagerank score for 2nd phase (Value should be double)
WEIGHT, // weight (norm) of document VSM (Value should be double)
LENGTH // for OkapiBM25 (Value should be integer)
}
protected String id = ""; // the 40 byte id
protected long offset = 0; // offset in documents file
// The size of the hashmap is only 3.... since up to now we have 3 properties to hold
protected final Map<PROPERTY, Object> props = new HashMap<>(3);
public DocInfoEssential(String id, long offset) {
this.id = id;
this.offset = offset;
}
/**
* Set property for this docID. Properties come from the PROPERY enum and
* value is an object
*
* @param prop
* @param value
*/
public void setProperty(DocInfoEssential.PROPERTY prop, Object value) {
props.put(prop, value);
}
/**
* Return the value of the property
*
* @param prop
* @return
*/
public Object getProperty(DocInfoEssential.PROPERTY prop) {
return props.get(prop);
}
public String getId() {
return id;
}
public long getOffset() {
return offset;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
DocInfoEssential other = (DocInfoEssential) o;
return this.id == other.id;
}
@Override
public int hashCode() {
return id.hashCode();
}
}
/*
* themis - A fair search engine for scientific articles
*
* Currently over the Semantic Scholar Open Research Corpus
* http://s2-public-api-prod.us-west-2.elasticbeanstalk.com/corpus/
*
* Collaborative work with the undergraduate/graduate students of
* Information Retrieval Systems (hy463) course
* Spring Semester 2020
*
* -- Writing code during COVID-19 pandemic times :-( --
*
* Aiming to participate in TREC 2020 Fair Ranking Track
* https://fair-trec.github.io/
*
* Computer Science Department http://www.csd.uoc.gr
* University of Crete
* Greece
*
* LICENCE: TO BE ADDED
*
* Copyright 2020
*
*/
package gr.csd.uoc.hy463.themis.indexer.model;
/**
* This class could be used when we want to get all information of a specific
* document, e.g. title, authors, etc.
*
* @author Panagiotis Papadakos <papadako at ics.forth.gr>
*/
public class DocInfoFull extends DocInfoEssential {